lsmpp commited on
Commit
8934dae
·
verified ·
1 Parent(s): 613726b

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +177 -0
  2. .venv/lib/python3.12/site-packages/torch/_decomp/__pycache__/__init__.cpython-312.pyc +0 -0
  3. .venv/lib/python3.12/site-packages/torch/_decomp/__pycache__/decompositions_for_rng.cpython-312.pyc +0 -0
  4. .venv/lib/python3.12/site-packages/torch/_dispatch/__pycache__/__init__.cpython-312.pyc +0 -0
  5. .venv/lib/python3.12/site-packages/torch/_dispatch/__pycache__/python.cpython-312.pyc +0 -0
  6. .venv/lib/python3.12/site-packages/torch/_inductor/__pycache__/__init__.cpython-312.pyc +0 -0
  7. .venv/lib/python3.12/site-packages/torch/_inductor/__pycache__/config.cpython-312.pyc +0 -0
  8. .venv/lib/python3.12/site-packages/torch/_inductor/__pycache__/cudagraph_utils.cpython-312.pyc +0 -0
  9. .venv/lib/python3.12/site-packages/torch/_inductor/__pycache__/custom_graph_pass.cpython-312.pyc +0 -0
  10. .venv/lib/python3.12/site-packages/torch/_inductor/__pycache__/standalone_compile.cpython-312.pyc +0 -0
  11. .venv/lib/python3.12/site-packages/torch/_inductor/__pycache__/test_operators.cpython-312.pyc +0 -0
  12. .venv/lib/python3.12/site-packages/torch/_inductor/autoheuristic/__init__.py +0 -0
  13. .venv/lib/python3.12/site-packages/torch/_inductor/autoheuristic/artifacts/_MMRankingA100.py +296 -0
  14. .venv/lib/python3.12/site-packages/torch/_inductor/autoheuristic/artifacts/_MMRankingH100.py +321 -0
  15. .venv/lib/python3.12/site-packages/torch/_inductor/autoheuristic/artifacts/_MixedMMA100.py +150 -0
  16. .venv/lib/python3.12/site-packages/torch/_inductor/autoheuristic/artifacts/_MixedMMH100.py +149 -0
  17. .venv/lib/python3.12/site-packages/torch/_inductor/autoheuristic/artifacts/_PadMMA100.py +109 -0
  18. .venv/lib/python3.12/site-packages/torch/_inductor/autoheuristic/artifacts/__init__.py +0 -0
  19. .venv/lib/python3.12/site-packages/torch/_inductor/autoheuristic/autoheuristic.py +315 -0
  20. .venv/lib/python3.12/site-packages/torch/_inductor/autoheuristic/autoheuristic_utils.py +339 -0
  21. .venv/lib/python3.12/site-packages/torch/_inductor/autoheuristic/learned_heuristic_controller.py +119 -0
  22. .venv/lib/python3.12/site-packages/torch/_inductor/autoheuristic/learnedheuristic_interface.py +95 -0
  23. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/__init__.py +0 -0
  24. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/aoti_hipify_utils.py +31 -0
  25. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/aoti_runtime/interface.cpp +443 -0
  26. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/block_analysis.py +175 -0
  27. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/common.py +2691 -0
  28. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp.py +0 -0
  29. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_bmm_template.py +262 -0
  30. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_flex_attention_template.py +1081 -0
  31. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_gemm_template.py +1777 -0
  32. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_grouped_gemm_template.py +500 -0
  33. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_micro_gemm.py +2011 -0
  34. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_template.py +138 -0
  35. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_template_kernel.py +597 -0
  36. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_utils.py +776 -0
  37. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_wrapper_cpu.py +0 -0
  38. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py +878 -0
  39. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_wrapper_gpu.py +717 -0
  40. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_wrapper_mps.py +99 -0
  41. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpu_device_op_overrides.py +27 -0
  42. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/cuda/__init__.py +0 -0
  43. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/cuda/cuda_cpp_scheduling.py +293 -0
  44. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/cuda/cuda_env.py +45 -0
  45. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/cuda/cuda_kernel.py +674 -0
  46. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/cuda/cuda_template.py +318 -0
  47. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/cuda/cutlass_cache.py +105 -0
  48. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/cuda/cutlass_lib_extensions/__init__.py +0 -0
  49. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/cuda/cutlass_lib_extensions/evt_extensions.py +240 -0
  50. .venv/lib/python3.12/site-packages/torch/_inductor/codegen/cuda/cutlass_lib_extensions/gemm_operation_extensions.py +411 -0
.gitattributes CHANGED
@@ -1825,3 +1825,180 @@ illustrious_generated/e3bdc40b2136.png filter=lfs diff=lfs merge=lfs -text
1825
  illustrious_generated/6ea1f2330bb2.png filter=lfs diff=lfs merge=lfs -text
1826
  illustrious_generated/a08733bfce6d.png filter=lfs diff=lfs merge=lfs -text
1827
  illustrious_generated/0aec7eb07d4f.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1825
  illustrious_generated/6ea1f2330bb2.png filter=lfs diff=lfs merge=lfs -text
1826
  illustrious_generated/a08733bfce6d.png filter=lfs diff=lfs merge=lfs -text
1827
  illustrious_generated/0aec7eb07d4f.png filter=lfs diff=lfs merge=lfs -text
1828
+ illustrious_generated/699b6be34d1c.png filter=lfs diff=lfs merge=lfs -text
1829
+ illustrious_generated/b8fe03330562.png filter=lfs diff=lfs merge=lfs -text
1830
+ illustrious_generated/efb6fe0239e2.png filter=lfs diff=lfs merge=lfs -text
1831
+ illustrious_generated/a00e6e3ca42d.png filter=lfs diff=lfs merge=lfs -text
1832
+ illustrious_generated/4c39593e352f.png filter=lfs diff=lfs merge=lfs -text
1833
+ illustrious_generated/daf79f3a5a47.png filter=lfs diff=lfs merge=lfs -text
1834
+ illustrious_generated/b66afd053661.png filter=lfs diff=lfs merge=lfs -text
1835
+ illustrious_generated/8a594acaada9.png filter=lfs diff=lfs merge=lfs -text
1836
+ illustrious_generated/532b5ad4f3b4.png filter=lfs diff=lfs merge=lfs -text
1837
+ illustrious_generated/aa2403d0dbde.png filter=lfs diff=lfs merge=lfs -text
1838
+ illustrious_generated/ffa981380c46.png filter=lfs diff=lfs merge=lfs -text
1839
+ illustrious_generated/60f785243e60.png filter=lfs diff=lfs merge=lfs -text
1840
+ illustrious_generated/776bf5e2d2a1.png filter=lfs diff=lfs merge=lfs -text
1841
+ illustrious_generated/5ab3056565d7.png filter=lfs diff=lfs merge=lfs -text
1842
+ illustrious_generated/bde193a32ad5.png filter=lfs diff=lfs merge=lfs -text
1843
+ illustrious_generated/58b9009a8433.png filter=lfs diff=lfs merge=lfs -text
1844
+ illustrious_generated/db68dd8cb2cf.png filter=lfs diff=lfs merge=lfs -text
1845
+ illustrious_generated/79a4d58eda5c.png filter=lfs diff=lfs merge=lfs -text
1846
+ illustrious_generated/6c2bec98397a.png filter=lfs diff=lfs merge=lfs -text
1847
+ illustrious_generated/56b23fd84d9e.png filter=lfs diff=lfs merge=lfs -text
1848
+ illustrious_generated/e38a1e2e958f.png filter=lfs diff=lfs merge=lfs -text
1849
+ illustrious_generated/4014bca6d153.png filter=lfs diff=lfs merge=lfs -text
1850
+ illustrious_generated/754db85fe85f.png filter=lfs diff=lfs merge=lfs -text
1851
+ illustrious_generated/0a4d2753b7c7.png filter=lfs diff=lfs merge=lfs -text
1852
+ illustrious_generated/a0e55ddbf74c.png filter=lfs diff=lfs merge=lfs -text
1853
+ illustrious_generated/c472bc9db7e5.png filter=lfs diff=lfs merge=lfs -text
1854
+ illustrious_generated/07f25b42b8d7.png filter=lfs diff=lfs merge=lfs -text
1855
+ illustrious_generated/a317dff0fe2e.png filter=lfs diff=lfs merge=lfs -text
1856
+ illustrious_generated/1ca2c247cfef.png filter=lfs diff=lfs merge=lfs -text
1857
+ illustrious_generated/92e013a48101.png filter=lfs diff=lfs merge=lfs -text
1858
+ illustrious_generated/53d7fbefadd0.png filter=lfs diff=lfs merge=lfs -text
1859
+ illustrious_generated/0d1561551252.png filter=lfs diff=lfs merge=lfs -text
1860
+ illustrious_generated/13d5cc66d420.png filter=lfs diff=lfs merge=lfs -text
1861
+ illustrious_generated/7429db9846f3.png filter=lfs diff=lfs merge=lfs -text
1862
+ illustrious_generated/49222c07a03f.png filter=lfs diff=lfs merge=lfs -text
1863
+ illustrious_generated/e526540fdbb8.png filter=lfs diff=lfs merge=lfs -text
1864
+ illustrious_generated/0e42a3a7d296.png filter=lfs diff=lfs merge=lfs -text
1865
+ illustrious_generated/29259007fbbf.png filter=lfs diff=lfs merge=lfs -text
1866
+ illustrious_generated/e189b3abf6d2.png filter=lfs diff=lfs merge=lfs -text
1867
+ illustrious_generated/4af1ebb4488c.png filter=lfs diff=lfs merge=lfs -text
1868
+ illustrious_generated/1755674c7bc1.png filter=lfs diff=lfs merge=lfs -text
1869
+ illustrious_generated/9a7e94351f39.png filter=lfs diff=lfs merge=lfs -text
1870
+ illustrious_generated/548d5f09851f.png filter=lfs diff=lfs merge=lfs -text
1871
+ illustrious_generated/8d3200b9e787.png filter=lfs diff=lfs merge=lfs -text
1872
+ illustrious_generated/a3a89fac4fcf.png filter=lfs diff=lfs merge=lfs -text
1873
+ illustrious_generated/e4bf1a6be7c6.png filter=lfs diff=lfs merge=lfs -text
1874
+ illustrious_generated/37c414606b7d.png filter=lfs diff=lfs merge=lfs -text
1875
+ illustrious_generated/c4033b9e6115.png filter=lfs diff=lfs merge=lfs -text
1876
+ illustrious_generated/29c45b6176ff.png filter=lfs diff=lfs merge=lfs -text
1877
+ illustrious_generated/e729db57f47c.png filter=lfs diff=lfs merge=lfs -text
1878
+ illustrious_generated/96a62fa83b20.png filter=lfs diff=lfs merge=lfs -text
1879
+ illustrious_generated/6a51d82ae8bc.png filter=lfs diff=lfs merge=lfs -text
1880
+ illustrious_generated/4632a958366a.png filter=lfs diff=lfs merge=lfs -text
1881
+ illustrious_generated/47aa86a18b5a.png filter=lfs diff=lfs merge=lfs -text
1882
+ illustrious_generated/d1995ff196e3.png filter=lfs diff=lfs merge=lfs -text
1883
+ illustrious_generated/e5ef86680557.png filter=lfs diff=lfs merge=lfs -text
1884
+ illustrious_generated/bd545030c487.png filter=lfs diff=lfs merge=lfs -text
1885
+ illustrious_generated/cef6d8bbd520.png filter=lfs diff=lfs merge=lfs -text
1886
+ illustrious_generated/6b23dba7be7d.png filter=lfs diff=lfs merge=lfs -text
1887
+ illustrious_generated/8d3ab41288cb.png filter=lfs diff=lfs merge=lfs -text
1888
+ illustrious_generated/e31935899f08.png filter=lfs diff=lfs merge=lfs -text
1889
+ illustrious_generated/bc6707dcf1f2.png filter=lfs diff=lfs merge=lfs -text
1890
+ illustrious_generated/b3a626cfe029.png filter=lfs diff=lfs merge=lfs -text
1891
+ illustrious_generated/3503bc2ba8ab.png filter=lfs diff=lfs merge=lfs -text
1892
+ illustrious_generated/ce38aa6d180d.png filter=lfs diff=lfs merge=lfs -text
1893
+ illustrious_generated/6035404a25b1.png filter=lfs diff=lfs merge=lfs -text
1894
+ illustrious_generated/9b1bb21a46e8.png filter=lfs diff=lfs merge=lfs -text
1895
+ illustrious_generated/a4eb52675c75.png filter=lfs diff=lfs merge=lfs -text
1896
+ illustrious_generated/a7acf95bd41f.png filter=lfs diff=lfs merge=lfs -text
1897
+ illustrious_generated/c636dce88f49.png filter=lfs diff=lfs merge=lfs -text
1898
+ illustrious_generated/8ef3560e5a42.png filter=lfs diff=lfs merge=lfs -text
1899
+ illustrious_generated/4f7953e3c61d.png filter=lfs diff=lfs merge=lfs -text
1900
+ illustrious_generated/a9d16660f9be.png filter=lfs diff=lfs merge=lfs -text
1901
+ illustrious_generated/5028ea7283a1.png filter=lfs diff=lfs merge=lfs -text
1902
+ illustrious_generated/259e79d12cba.png filter=lfs diff=lfs merge=lfs -text
1903
+ illustrious_generated/aa08fc4ab5f7.png filter=lfs diff=lfs merge=lfs -text
1904
+ illustrious_generated/a048f190d616.png filter=lfs diff=lfs merge=lfs -text
1905
+ illustrious_generated/64d5e838ff32.png filter=lfs diff=lfs merge=lfs -text
1906
+ illustrious_generated/f8b258d2d426.png filter=lfs diff=lfs merge=lfs -text
1907
+ illustrious_generated/73e5f02dca91.png filter=lfs diff=lfs merge=lfs -text
1908
+ illustrious_generated/176f37a6cd7f.png filter=lfs diff=lfs merge=lfs -text
1909
+ illustrious_generated/75bc8bf6da61.png filter=lfs diff=lfs merge=lfs -text
1910
+ illustrious_generated/831ac5f05b4e.png filter=lfs diff=lfs merge=lfs -text
1911
+ illustrious_generated/6061d4cce84a.png filter=lfs diff=lfs merge=lfs -text
1912
+ illustrious_generated/1970ccc17731.png filter=lfs diff=lfs merge=lfs -text
1913
+ illustrious_generated/46d7a6324536.png filter=lfs diff=lfs merge=lfs -text
1914
+ illustrious_generated/593a5b377fb6.png filter=lfs diff=lfs merge=lfs -text
1915
+ illustrious_generated/9e588e17280b.png filter=lfs diff=lfs merge=lfs -text
1916
+ illustrious_generated/f086d40d76b7.png filter=lfs diff=lfs merge=lfs -text
1917
+ illustrious_generated/e5b4630753fb.png filter=lfs diff=lfs merge=lfs -text
1918
+ illustrious_generated/bf19c6825d9d.png filter=lfs diff=lfs merge=lfs -text
1919
+ illustrious_generated/ad90d731a0fa.png filter=lfs diff=lfs merge=lfs -text
1920
+ illustrious_generated/4c7a209b8887.png filter=lfs diff=lfs merge=lfs -text
1921
+ illustrious_generated/b52834e10fd4.png filter=lfs diff=lfs merge=lfs -text
1922
+ illustrious_generated/e0fa2a6e6b42.png filter=lfs diff=lfs merge=lfs -text
1923
+ illustrious_generated/b770edd1abe0.png filter=lfs diff=lfs merge=lfs -text
1924
+ illustrious_generated/bf8fcd7e3ebb.png filter=lfs diff=lfs merge=lfs -text
1925
+ illustrious_generated/b2a469095584.png filter=lfs diff=lfs merge=lfs -text
1926
+ illustrious_generated/4af582043c66.png filter=lfs diff=lfs merge=lfs -text
1927
+ illustrious_generated/666f1a84cb80.png filter=lfs diff=lfs merge=lfs -text
1928
+ illustrious_generated/c87663be02a7.png filter=lfs diff=lfs merge=lfs -text
1929
+ illustrious_generated/5fa4fb280d76.png filter=lfs diff=lfs merge=lfs -text
1930
+ illustrious_generated/c65bd94bf971.png filter=lfs diff=lfs merge=lfs -text
1931
+ illustrious_generated/a3f618970fb7.png filter=lfs diff=lfs merge=lfs -text
1932
+ illustrious_generated/c588cc386611.png filter=lfs diff=lfs merge=lfs -text
1933
+ illustrious_generated/e412e3f1af16.png filter=lfs diff=lfs merge=lfs -text
1934
+ illustrious_generated/acf002111806.png filter=lfs diff=lfs merge=lfs -text
1935
+ illustrious_generated/3dbcf74b53f1.png filter=lfs diff=lfs merge=lfs -text
1936
+ illustrious_generated/149a82673fe9.png filter=lfs diff=lfs merge=lfs -text
1937
+ illustrious_generated/4536122bec42.png filter=lfs diff=lfs merge=lfs -text
1938
+ illustrious_generated/25df61f584ba.png filter=lfs diff=lfs merge=lfs -text
1939
+ illustrious_generated/5803e88c45f3.png filter=lfs diff=lfs merge=lfs -text
1940
+ illustrious_generated/d5f24e4f2554.png filter=lfs diff=lfs merge=lfs -text
1941
+ illustrious_generated/57264915c81a.png filter=lfs diff=lfs merge=lfs -text
1942
+ illustrious_generated/b5a6eaf580ce.png filter=lfs diff=lfs merge=lfs -text
1943
+ illustrious_generated/18f1f90221cc.png filter=lfs diff=lfs merge=lfs -text
1944
+ illustrious_generated/798df3adbd88.png filter=lfs diff=lfs merge=lfs -text
1945
+ illustrious_generated/617ce74ffafa.png filter=lfs diff=lfs merge=lfs -text
1946
+ illustrious_generated/1dfed5dbd442.png filter=lfs diff=lfs merge=lfs -text
1947
+ illustrious_generated/243fd5bd7679.png filter=lfs diff=lfs merge=lfs -text
1948
+ illustrious_generated/2433395d6d43.png filter=lfs diff=lfs merge=lfs -text
1949
+ illustrious_generated/96ab8498c7a6.png filter=lfs diff=lfs merge=lfs -text
1950
+ illustrious_generated/4a574003bbc9.png filter=lfs diff=lfs merge=lfs -text
1951
+ illustrious_generated/d209e317052f.png filter=lfs diff=lfs merge=lfs -text
1952
+ illustrious_generated/fb9ed29c63d7.png filter=lfs diff=lfs merge=lfs -text
1953
+ illustrious_generated/ece3750ad6a6.png filter=lfs diff=lfs merge=lfs -text
1954
+ illustrious_generated/690e7719d4bf.png filter=lfs diff=lfs merge=lfs -text
1955
+ illustrious_generated/639cf9531d64.png filter=lfs diff=lfs merge=lfs -text
1956
+ illustrious_generated/56e66190e7ce.png filter=lfs diff=lfs merge=lfs -text
1957
+ illustrious_generated/ceb937e0801e.png filter=lfs diff=lfs merge=lfs -text
1958
+ illustrious_generated/ed0e6ab9592e.png filter=lfs diff=lfs merge=lfs -text
1959
+ illustrious_generated/312f15890343.png filter=lfs diff=lfs merge=lfs -text
1960
+ illustrious_generated/d21efb5f15d1.png filter=lfs diff=lfs merge=lfs -text
1961
+ illustrious_generated/b01a98afee9a.png filter=lfs diff=lfs merge=lfs -text
1962
+ illustrious_generated/3cbd0d9a36a7.png filter=lfs diff=lfs merge=lfs -text
1963
+ illustrious_generated/6994acf3db4f.png filter=lfs diff=lfs merge=lfs -text
1964
+ illustrious_generated/b5ee5bce0095.png filter=lfs diff=lfs merge=lfs -text
1965
+ illustrious_generated/e150c196c1de.png filter=lfs diff=lfs merge=lfs -text
1966
+ illustrious_generated/d0d5e2ddb402.png filter=lfs diff=lfs merge=lfs -text
1967
+ illustrious_generated/b102a265b15d.png filter=lfs diff=lfs merge=lfs -text
1968
+ illustrious_generated/b0202f93d233.png filter=lfs diff=lfs merge=lfs -text
1969
+ illustrious_generated/14cde88e8c02.png filter=lfs diff=lfs merge=lfs -text
1970
+ illustrious_generated/30f0d4a3b6b6.png filter=lfs diff=lfs merge=lfs -text
1971
+ illustrious_generated/33aeb710e683.png filter=lfs diff=lfs merge=lfs -text
1972
+ illustrious_generated/71485a12c097.png filter=lfs diff=lfs merge=lfs -text
1973
+ illustrious_generated/2c3ca96d94bc.png filter=lfs diff=lfs merge=lfs -text
1974
+ illustrious_generated/a630cf5674f3.png filter=lfs diff=lfs merge=lfs -text
1975
+ illustrious_generated/163359d65694.png filter=lfs diff=lfs merge=lfs -text
1976
+ illustrious_generated/2f054f26c97c.png filter=lfs diff=lfs merge=lfs -text
1977
+ illustrious_generated/af2a10d42166.png filter=lfs diff=lfs merge=lfs -text
1978
+ illustrious_generated/4d7d6abe842c.png filter=lfs diff=lfs merge=lfs -text
1979
+ illustrious_generated/62fb4f26c8d0.png filter=lfs diff=lfs merge=lfs -text
1980
+ illustrious_generated/8bc741c00933.png filter=lfs diff=lfs merge=lfs -text
1981
+ illustrious_generated/f3021039e6df.png filter=lfs diff=lfs merge=lfs -text
1982
+ illustrious_generated/7ed913ac6954.png filter=lfs diff=lfs merge=lfs -text
1983
+ illustrious_generated/a9ab451bd4c9.png filter=lfs diff=lfs merge=lfs -text
1984
+ illustrious_generated/49c90b238c77.png filter=lfs diff=lfs merge=lfs -text
1985
+ illustrious_generated/1371e80e6f16.png filter=lfs diff=lfs merge=lfs -text
1986
+ illustrious_generated/f284fb62c9bf.png filter=lfs diff=lfs merge=lfs -text
1987
+ illustrious_generated/71e9866c5494.png filter=lfs diff=lfs merge=lfs -text
1988
+ illustrious_generated/abf601407cb0.png filter=lfs diff=lfs merge=lfs -text
1989
+ illustrious_generated/83da1d18e129.png filter=lfs diff=lfs merge=lfs -text
1990
+ illustrious_generated/484afd596fd0.png filter=lfs diff=lfs merge=lfs -text
1991
+ illustrious_generated/4c0f1f03dcd2.png filter=lfs diff=lfs merge=lfs -text
1992
+ illustrious_generated/2008ee2e6f66.png filter=lfs diff=lfs merge=lfs -text
1993
+ illustrious_generated/48fcc54ce758.png filter=lfs diff=lfs merge=lfs -text
1994
+ illustrious_generated/4769b078890d.png filter=lfs diff=lfs merge=lfs -text
1995
+ illustrious_generated/b716f3a23f11.png filter=lfs diff=lfs merge=lfs -text
1996
+ illustrious_generated/8e740eadaf18.png filter=lfs diff=lfs merge=lfs -text
1997
+ illustrious_generated/88e259bbe761.png filter=lfs diff=lfs merge=lfs -text
1998
+ illustrious_generated/476617799f8d.png filter=lfs diff=lfs merge=lfs -text
1999
+ illustrious_generated/1bb92008067a.png filter=lfs diff=lfs merge=lfs -text
2000
+ illustrious_generated/38cdd5aae9bb.png filter=lfs diff=lfs merge=lfs -text
2001
+ illustrious_generated/5e8555d59ee2.png filter=lfs diff=lfs merge=lfs -text
2002
+ illustrious_generated/f3f41fd3689e.png filter=lfs diff=lfs merge=lfs -text
2003
+ illustrious_generated/ca3147d568a3.png filter=lfs diff=lfs merge=lfs -text
2004
+ illustrious_generated/8e375573fe30.png filter=lfs diff=lfs merge=lfs -text
.venv/lib/python3.12/site-packages/torch/_decomp/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (28.2 kB). View file
 
.venv/lib/python3.12/site-packages/torch/_decomp/__pycache__/decompositions_for_rng.cpython-312.pyc ADDED
Binary file (12.5 kB). View file
 
.venv/lib/python3.12/site-packages/torch/_dispatch/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (192 Bytes). View file
 
.venv/lib/python3.12/site-packages/torch/_dispatch/__pycache__/python.cpython-312.pyc ADDED
Binary file (11.5 kB). View file
 
.venv/lib/python3.12/site-packages/torch/_inductor/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (14 kB). View file
 
.venv/lib/python3.12/site-packages/torch/_inductor/__pycache__/config.cpython-312.pyc ADDED
Binary file (41.4 kB). View file
 
.venv/lib/python3.12/site-packages/torch/_inductor/__pycache__/cudagraph_utils.cpython-312.pyc ADDED
Binary file (17.5 kB). View file
 
.venv/lib/python3.12/site-packages/torch/_inductor/__pycache__/custom_graph_pass.cpython-312.pyc ADDED
Binary file (5.43 kB). View file
 
.venv/lib/python3.12/site-packages/torch/_inductor/__pycache__/standalone_compile.cpython-312.pyc ADDED
Binary file (12 kB). View file
 
.venv/lib/python3.12/site-packages/torch/_inductor/__pycache__/test_operators.cpython-312.pyc ADDED
Binary file (2.05 kB). View file
 
.venv/lib/python3.12/site-packages/torch/_inductor/autoheuristic/__init__.py ADDED
File without changes
.venv/lib/python3.12/site-packages/torch/_inductor/autoheuristic/artifacts/_MMRankingA100.py ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa: B950
2
+ # fmt: off
3
+ # This file was generated by AutoHeuristic. Do not modify it manually!
4
+ # To regenerate this file, take a look at the steps in the README.md file inside torchgen/_autoheuristic/mm/
5
+ from typing import List, Optional, Tuple
6
+
7
+ from torch._inductor.autoheuristic.autoheuristic_utils import (
8
+ AHContext,
9
+ AHMetadata,
10
+ Choice,
11
+ )
12
+ from torch._inductor.autoheuristic.learnedheuristic_interface import (
13
+ LearnedHeuristicDecision,
14
+ )
15
+
16
+
17
+ class MMRankingA100(LearnedHeuristicDecision):
18
+
19
+ def __init__(self) -> None:
20
+ self.choices: List[Choice] = []
21
+ self.fill_choices()
22
+
23
+ def check_precondition(self, metadata: AHMetadata, context: AHContext,) -> bool:
24
+ return (
25
+ metadata.name == self.get_name()
26
+ and metadata.shared_memory == 166912
27
+ and str(metadata.device_capa) == "(8, 0)"
28
+ )
29
+
30
+ def get_confidence_threshold(self) -> float:
31
+ return 0.0
32
+
33
+ def get_choice(self, idx: int) -> Optional[str]:
34
+ if idx < len(self.choices):
35
+ return self.choices[idx]
36
+ return None
37
+
38
+ def fill_choices(self) -> None:
39
+ self.choices.append('extern_mm')
40
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=128_BLOCK-N=16_numstages=4_numwarps=8')
41
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=128_BLOCK-N=32_numstages=4_numwarps=8')
42
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=128_BLOCK-N=64_numstages=4_numwarps=8')
43
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=128_numstages=2_numwarps=8')
44
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=128_numstages=3_numwarps=4')
45
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=128_numstages=3_numwarps=8')
46
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=128_numstages=4_numwarps=4')
47
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=128_numstages=5_numwarps=4')
48
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=128_numstages=5_numwarps=8')
49
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=2_numwarps=2')
50
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=2_numwarps=8')
51
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=3_numwarps=4')
52
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=3_numwarps=8')
53
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=4_numwarps=4')
54
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=4_numwarps=8')
55
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=5_numwarps=4')
56
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=5_numwarps=8')
57
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=2_numwarps=2')
58
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=2_numwarps=8')
59
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=3_numwarps=4')
60
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=3_numwarps=8')
61
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=4_numwarps=4')
62
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=4_numwarps=8')
63
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=5_numwarps=4')
64
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=5_numwarps=8')
65
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=2_numwarps=2')
66
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=2_numwarps=8')
67
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=3_numwarps=4')
68
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=3_numwarps=8')
69
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=4_numwarps=4')
70
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=4_numwarps=8')
71
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=5_numwarps=4')
72
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=5_numwarps=8')
73
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=2_numwarps=8')
74
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=4')
75
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=8')
76
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=4_numwarps=4')
77
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=5_numwarps=4')
78
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=5_numwarps=8')
79
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=2_numwarps=8')
80
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=3_numwarps=4')
81
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=3_numwarps=8')
82
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=4_numwarps=4')
83
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=4_numwarps=8')
84
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=5_numwarps=4')
85
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=5_numwarps=8')
86
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=2_numwarps=2')
87
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=2_numwarps=8')
88
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=3_numwarps=4')
89
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=3_numwarps=8')
90
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=4_numwarps=4')
91
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=4_numwarps=8')
92
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=5_numwarps=4')
93
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=5_numwarps=8')
94
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=2_numwarps=2')
95
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=2_numwarps=8')
96
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=3_numwarps=4')
97
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=3_numwarps=8')
98
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=4_numwarps=4')
99
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=4_numwarps=8')
100
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=5_numwarps=4')
101
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=5_numwarps=8')
102
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=4')
103
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=8')
104
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=4')
105
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=8')
106
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=16_numstages=3_numwarps=4')
107
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=16_numstages=3_numwarps=8')
108
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=16_numstages=4_numwarps=8')
109
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=16_numstages=5_numwarps=4')
110
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=16_numstages=5_numwarps=8')
111
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=32_numstages=3_numwarps=4')
112
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=32_numstages=3_numwarps=8')
113
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=32_numstages=4_numwarps=8')
114
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=32_numstages=5_numwarps=4')
115
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=32_numstages=5_numwarps=8')
116
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=4')
117
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=8')
118
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=64_numstages=4_numwarps=8')
119
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=64_numstages=5_numwarps=4')
120
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=64_numstages=5_numwarps=8')
121
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4')
122
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=2')
123
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=64_numstages=3_numwarps=4')
124
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=64_numstages=4_numwarps=4')
125
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=64_numstages=5_numwarps=4')
126
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=128_numstages=3_numwarps=4')
127
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=128_numstages=3_numwarps=8')
128
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=128_numstages=4_numwarps=4')
129
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=128_numstages=4_numwarps=8')
130
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=128_numstages=5_numwarps=8')
131
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=16_numstages=5_numwarps=1')
132
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=32_numstages=1_numwarps=2')
133
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=32_numstages=2_numwarps=2')
134
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=32_numstages=3_numwarps=2')
135
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=32_numstages=4_numwarps=2')
136
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=32_numstages=5_numwarps=2')
137
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=64_numstages=2_numwarps=4')
138
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=64_numstages=4_numwarps=4')
139
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=64_numstages=5_numwarps=4')
140
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=4')
141
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=128_numstages=4_numwarps=4')
142
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=16_numstages=5_numwarps=1')
143
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=32_numstages=5_numwarps=2')
144
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=64_numstages=3_numwarps=4')
145
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=64_numstages=4_numwarps=4')
146
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=64_numstages=5_numwarps=4')
147
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=4')
148
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=8')
149
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=32_numstages=2_numwarps=2')
150
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=4')
151
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=64_numstages=4_numwarps=4')
152
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=64_numstages=5_numwarps=4')
153
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4')
154
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=16_numstages=2_numwarps=2')
155
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=32_numstages=2_numwarps=4')
156
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=4')
157
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=64_numstages=3_numwarps=4')
158
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=64_numstages=4_numwarps=8')
159
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=64_numstages=5_numwarps=4')
160
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=16_numstages=1_numwarps=2')
161
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=16_numstages=2_numwarps=2')
162
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=16_numstages=5_numwarps=2')
163
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=32_numstages=1_numwarps=2')
164
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=32_numstages=2_numwarps=4')
165
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=32_numstages=5_numwarps=4')
166
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=64_numstages=5_numwarps=8')
167
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=32_BLOCK-N=16_numstages=2_numwarps=2')
168
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=32_BLOCK-N=16_numstages=5_numwarps=2')
169
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=32_BLOCK-N=64_numstages=5_numwarps=8')
170
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=4')
171
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=4')
172
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=64_BLOCK-N=32_numstages=2_numwarps=4')
173
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4')
174
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=16_numstages=3_numwarps=4')
175
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=16_numstages=4_numwarps=4')
176
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=16_numstages=5_numwarps=4')
177
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=32_numstages=3_numwarps=4')
178
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=32_numstages=4_numwarps=4')
179
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=4')
180
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=64_numstages=3_numwarps=4')
181
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=64_numstages=4_numwarps=4')
182
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=64_numstages=4_numwarps=8')
183
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=64_numstages=5_numwarps=4')
184
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=128_numstages=3_numwarps=4')
185
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=128_numstages=4_numwarps=4')
186
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=128_numstages=4_numwarps=8')
187
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=16_numstages=2_numwarps=4')
188
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=16_numstages=3_numwarps=4')
189
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=16_numstages=4_numwarps=4')
190
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=16_numstages=5_numwarps=4')
191
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=32_numstages=2_numwarps=4')
192
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=32_numstages=3_numwarps=4')
193
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=32_numstages=3_numwarps=8')
194
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=32_numstages=4_numwarps=4')
195
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=32_numstages=4_numwarps=8')
196
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=32_numstages=5_numwarps=4')
197
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=32_numstages=5_numwarps=8')
198
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=64_numstages=2_numwarps=4')
199
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=64_numstages=3_numwarps=4')
200
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=64_numstages=3_numwarps=8')
201
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=64_numstages=4_numwarps=4')
202
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=64_numstages=4_numwarps=8')
203
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=64_numstages=5_numwarps=4')
204
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=4')
205
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=128_numstages=4_numwarps=4')
206
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=128_numstages=4_numwarps=8')
207
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=16_numstages=2_numwarps=4')
208
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=16_numstages=3_numwarps=4')
209
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=16_numstages=4_numwarps=4')
210
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=16_numstages=5_numwarps=4')
211
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=2_numwarps=4')
212
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=3_numwarps=4')
213
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=3_numwarps=8')
214
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=4_numwarps=4')
215
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=4_numwarps=8')
216
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=5_numwarps=4')
217
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=5_numwarps=8')
218
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=2_numwarps=4')
219
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=3_numwarps=4')
220
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=3_numwarps=8')
221
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=4_numwarps=4')
222
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=4_numwarps=8')
223
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=5_numwarps=4')
224
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=4')
225
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=128_numstages=4_numwarps=4')
226
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=16_numstages=3_numwarps=4')
227
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=16_numstages=4_numwarps=4')
228
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=16_numstages=5_numwarps=4')
229
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=32_numstages=3_numwarps=4')
230
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=32_numstages=3_numwarps=8')
231
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=32_numstages=4_numwarps=4')
232
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=32_numstages=5_numwarps=4')
233
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=4')
234
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=8')
235
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=64_numstages=4_numwarps=4')
236
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=64_numstages=5_numwarps=4')
237
+
238
+ def get_name(self) -> str:
239
+ return 'mm'
240
+
241
+ def get_best_choices(self, context: AHContext) -> Optional[List[tuple[float, int]]]:
242
+ if context.get_value('arith_intensity') <= 52.6245059967041:
243
+ if context.get_value('n') <= 34.0:
244
+ if context.get_value('n') <= 18.0:
245
+ if context.get_value('k*n') <= 312.0:
246
+ return [(0.093, 12), (0.081, 16), (0.081, 148), (0.070, 10), (0.070, 17), (0.070, 149), (0.070, 151), (0.070, 150), (0.070, 14), (0.058, 11), (0.058, 15), (0.058, 13), (0.058, 122), (0.047, 121), (0.035, 123), (0.012, 92)]
247
+ else:
248
+ if context.get_value('k') <= 40.0:
249
+ return [(0.083, 42), (0.083, 46), (0.083, 44), (0.083, 40), (0.083, 128), (0.067, 45), (0.067, 43), (0.067, 41), (0.067, 169), (0.067, 171), (0.067, 168), (0.067, 129), (0.067, 170), (0.033, 103), (0.017, 121)]
250
+ else:
251
+ return [(0.112, 137), (0.104, 136), (0.101, 0), (0.081, 1), (0.073, 135), (0.069, 67), (0.066, 187), (0.058, 41), (0.050, 71), (0.046, 68), (0.046, 70), (0.031, 44), (0.027, 43), (0.027, 170), (0.019, 189), (0.019, 188), (0.015, 169), (0.015, 171), (0.012, 115), (0.012, 168), (0.012, 69), (0.004, 103)]
252
+ else:
253
+ if context.get_value('mat1_stride_0') <= 20.0:
254
+ return [(0.069, 0), (0.059, 157), (0.059, 22), (0.059, 153), (0.059, 155), (0.059, 25), (0.059, 23), (0.059, 19), (0.044, 21), (0.044, 18), (0.044, 152), (0.044, 158), (0.044, 154), (0.044, 156), (0.044, 20), (0.044, 124), (0.044, 24), (0.030, 125), (0.029, 126), (0.015, 97), (0.015, 95), (0.015, 96), (0.010, 2), (0.010, 75)]
255
+ else:
256
+ if context.get_value('k') <= 68.0:
257
+ return [(0.087, 72), (0.087, 74), (0.087, 73), (0.086, 76), (0.077, 75), (0.067, 192), (0.058, 190), (0.048, 47), (0.048, 193), (0.048, 49), (0.048, 51), (0.048, 191), (0.038, 53), (0.019, 133), (0.019, 50), (0.019, 175), (0.019, 172), (0.019, 48), (0.019, 174), (0.010, 173), (0.010, 177), (0.010, 52), (0.010, 54), (0.010, 178), (0.010, 176)]
258
+ else:
259
+ return [(0.154, 52), (0.154, 72), (0.102, 75), (0.087, 49), (0.087, 73), (0.086, 51), (0.057, 176), (0.045, 2), (0.038, 191), (0.038, 178), (0.038, 190), (0.029, 173), (0.029, 76), (0.026, 138), (0.013, 139), (0.013, 140), (0.003, 0)]
260
+ else:
261
+ if context.get_value('k') <= 35.0:
262
+ if context.get_value('k') <= 18.0:
263
+ if context.get_value('m*n') <= 19505152.0:
264
+ return [(0.151, 159), (0.140, 160), (0.129, 164), (0.055, 127), (0.051, 29), (0.044, 161), (0.044, 147), (0.040, 146), (0.040, 31), (0.037, 145), (0.026, 28), (0.022, 90), (0.022, 93), (0.022, 94), (0.022, 100), (0.022, 125), (0.022, 158), (0.022, 157), (0.011, 87), (0.011, 88), (0.011, 89), (0.011, 91), (0.011, 95), (0.011, 96), (0.011, 98), (0.011, 99)]
265
+ else:
266
+ return [(0.069, 7), (0.069, 5), (0.067, 147), (0.066, 8), (0.061, 145), (0.058, 146), (0.052, 124), (0.049, 29), (0.049, 159), (0.046, 31), (0.043, 157), (0.041, 9), (0.041, 4), (0.040, 6), (0.035, 164), (0.035, 160), (0.026, 158), (0.017, 125), (0.017, 28), (0.017, 32), (0.017, 162), (0.017, 27), (0.017, 30), (0.017, 161), (0.009, 33), (0.009, 26), (0.009, 163), (0.006, 0)]
267
+ else:
268
+ if context.get_value('n') <= 68.0:
269
+ return [(0.101, 182), (0.101, 59), (0.088, 57), (0.076, 184), (0.076, 61), (0.076, 179), (0.076, 62), (0.076, 58), (0.063, 180), (0.063, 60), (0.051, 56), (0.050, 181), (0.025, 130), (0.025, 177), (0.025, 183), (0.013, 178), (0.013, 55)]
270
+ else:
271
+ return [(0.089, 180), (0.079, 60), (0.066, 35), (0.066, 181), (0.066, 38), (0.066, 58), (0.066, 179), (0.066, 57), (0.062, 184), (0.053, 37), (0.044, 166), (0.040, 55), (0.040, 39), (0.040, 36), (0.040, 165), (0.040, 167), (0.027, 177), (0.027, 34), (0.022, 159)]
272
+ else:
273
+ if context.get_value('m*n') <= 309760.0:
274
+ return [(0.298, 0), (0.097, 140), (0.080, 83), (0.072, 86), (0.044, 84), (0.036, 178), (0.036, 117), (0.036, 82), (0.032, 120), (0.032, 85), (0.028, 119), (0.024, 130), (0.024, 109), (0.020, 108), (0.020, 118), (0.012, 104), (0.012, 116), (0.012, 141), (0.012, 144), (0.008, 105), (0.008, 106), (0.008, 111), (0.008, 114), (0.008, 107), (0.008, 132), (0.004, 101), (0.004, 102), (0.004, 110), (0.004, 112), (0.004, 113), (0.004, 131)]
275
+ else:
276
+ if context.get_value('n') <= 72.0:
277
+ return [(0.227, 77), (0.118, 78), (0.102, 194), (0.086, 80), (0.059, 57), (0.054, 81), (0.049, 196), (0.048, 197), (0.048, 59), (0.043, 79), (0.032, 195), (0.027, 180), (0.022, 3), (0.021, 141), (0.016, 60), (0.016, 142), (0.011, 183), (0.011, 0), (0.011, 144)]
278
+ else:
279
+ return [(0.140, 186), (0.132, 185), (0.109, 63), (0.085, 65), (0.078, 37), (0.077, 35), (0.062, 197), (0.047, 194), (0.046, 165), (0.046, 57), (0.039, 78), (0.039, 79), (0.039, 66), (0.039, 64), (0.016, 195), (0.008, 159)]
280
+ else:
281
+ if str(context.get_value('using_tf32')) != 'False':
282
+ if context.get_value('m*n') <= 815360.0:
283
+ if context.get_value('k') <= 1184.0:
284
+ return [(0.218, 140), (0.205, 0), (0.154, 144), (0.115, 141), (0.051, 185), (0.051, 104), (0.039, 78), (0.038, 116), (0.026, 165), (0.026, 130), (0.026, 178), (0.013, 57), (0.013, 195), (0.013, 167), (0.013, 186)]
285
+ else:
286
+ return [(0.901, 0), (0.030, 144), (0.030, 134), (0.016, 3), (0.006, 78), (0.006, 77), (0.002, 57), (0.002, 194), (0.002, 59), (0.002, 60), (0.002, 143)]
287
+ else:
288
+ if context.get_value('arith_intensity') <= 187.23922729492188:
289
+ if context.get_value('mat1_stride_0') <= 198.0:
290
+ return [(0.273, 63), (0.158, 37), (0.152, 35), (0.127, 57), (0.097, 165), (0.053, 185), (0.031, 0), (0.028, 64), (0.014, 60), (0.014, 78), (0.009, 55), (0.008, 134), (0.005, 34), (0.005, 167), (0.005, 179), (0.005, 65), (0.005, 66), (0.005, 186), (0.005, 194), (0.002, 166)]
291
+ else:
292
+ return [(0.296, 63), (0.235, 0), (0.132, 64), (0.074, 37), (0.069, 78), (0.051, 185), (0.051, 35), (0.030, 57), (0.020, 77), (0.016, 194), (0.008, 66), (0.007, 65), (0.003, 3), (0.003, 165), (0.003, 141), (0.001, 134), (0.001, 166)]
293
+ else:
294
+ return [(0.405, 0), (0.246, 37), (0.177, 63), (0.145, 35), (0.005, 185), (0.005, 65), (0.005, 64), (0.004, 57), (0.003, 66), (0.002, 165), (0.001, 78), (0.001, 55)]
295
+ else:
296
+ return [(0.357, 0), (0.112, 165), (0.101, 57), (0.094, 179), (0.086, 64), (0.074, 167), (0.067, 60), (0.064, 159), (0.033, 35), (0.007, 195), (0.002, 180), (0.001, 34), (0.001, 166), (0.001, 78)]
.venv/lib/python3.12/site-packages/torch/_inductor/autoheuristic/artifacts/_MMRankingH100.py ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa: B950
2
+ # fmt: off
3
+ # This file was generated by AutoHeuristic. Do not modify it manually!
4
+ # To regenerate this file, take a look at the steps in the README.md file inside torchgen/_autoheuristic/mm/
5
+ from typing import List, Optional, Tuple
6
+
7
+ from torch._inductor.autoheuristic.autoheuristic_utils import (
8
+ AHContext,
9
+ AHMetadata,
10
+ Choice,
11
+ )
12
+ from torch._inductor.autoheuristic.learnedheuristic_interface import (
13
+ LearnedHeuristicDecision,
14
+ )
15
+
16
+
17
+ class MMRankingH100(LearnedHeuristicDecision):
18
+
19
+ def __init__(self) -> None:
20
+ self.choices: List[Choice] = []
21
+ self.fill_choices()
22
+
23
+ def check_precondition(self, metadata: AHMetadata, context: AHContext,) -> bool:
24
+ return (
25
+ metadata.name == self.get_name()
26
+ and metadata.shared_memory == 232448
27
+ and str(metadata.device_capa) == "(9, 0)"
28
+ )
29
+
30
+ def get_confidence_threshold(self) -> float:
31
+ return 0.0
32
+
33
+ def get_choice(self, idx: int) -> Optional[str]:
34
+ if idx < len(self.choices):
35
+ return self.choices[idx]
36
+ return None
37
+
38
+ def fill_choices(self) -> None:
39
+ self.choices.append('extern_mm')
40
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=128_BLOCK-N=16_numstages=4_numwarps=8')
41
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=128_BLOCK-N=32_numstages=4_numwarps=8')
42
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=128_BLOCK-N=64_numstages=4_numwarps=8')
43
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=128_numstages=2_numwarps=8')
44
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=128_numstages=3_numwarps=4')
45
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=128_numstages=3_numwarps=8')
46
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=128_numstages=4_numwarps=4')
47
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=128_numstages=5_numwarps=4')
48
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=128_numstages=5_numwarps=8')
49
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=2_numwarps=2')
50
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=2_numwarps=8')
51
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=3_numwarps=4')
52
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=3_numwarps=8')
53
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=4_numwarps=4')
54
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=4_numwarps=8')
55
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=5_numwarps=4')
56
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=16_numstages=5_numwarps=8')
57
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=2_numwarps=2')
58
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=2_numwarps=8')
59
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=3_numwarps=4')
60
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=4_numwarps=4')
61
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=4_numwarps=8')
62
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=32_numstages=5_numwarps=4')
63
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=2_numwarps=2')
64
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=2_numwarps=8')
65
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=3_numwarps=4')
66
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=3_numwarps=8')
67
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=4_numwarps=4')
68
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=4_numwarps=8')
69
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=5_numwarps=4')
70
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=16_BLOCK-N=64_numstages=5_numwarps=8')
71
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=2_numwarps=8')
72
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=4')
73
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=8')
74
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=4_numwarps=4')
75
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=5_numwarps=4')
76
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=5_numwarps=8')
77
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=2_numwarps=2')
78
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=2_numwarps=8')
79
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=3_numwarps=4')
80
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=3_numwarps=8')
81
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=4_numwarps=4')
82
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=4_numwarps=8')
83
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=5_numwarps=4')
84
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=16_numstages=5_numwarps=8')
85
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=2_numwarps=2')
86
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=2_numwarps=8')
87
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=3_numwarps=4')
88
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=3_numwarps=8')
89
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=4_numwarps=4')
90
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=4_numwarps=8')
91
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=5_numwarps=4')
92
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=32_numstages=5_numwarps=8')
93
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=2_numwarps=2')
94
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=2_numwarps=8')
95
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=3_numwarps=4')
96
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=3_numwarps=8')
97
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=4_numwarps=4')
98
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=4_numwarps=8')
99
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=5_numwarps=4')
100
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=5_numwarps=8')
101
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=4')
102
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=8')
103
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=4')
104
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=8')
105
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=16_numstages=3_numwarps=4')
106
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=16_numstages=3_numwarps=8')
107
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=16_numstages=4_numwarps=8')
108
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=16_numstages=5_numwarps=4')
109
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=16_numstages=5_numwarps=8')
110
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=32_numstages=3_numwarps=4')
111
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=32_numstages=3_numwarps=8')
112
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=32_numstages=4_numwarps=8')
113
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=32_numstages=5_numwarps=4')
114
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=32_numstages=5_numwarps=8')
115
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=4')
116
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=8')
117
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=64_numstages=4_numwarps=8')
118
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=64_numstages=5_numwarps=4')
119
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=64_numstages=5_numwarps=8')
120
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4')
121
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=32_numstages=2_numwarps=2')
122
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=2')
123
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=64_numstages=3_numwarps=4')
124
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=64_numstages=4_numwarps=4')
125
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=64_numstages=5_numwarps=4')
126
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=128_numstages=2_numwarps=8')
127
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=128_numstages=3_numwarps=4')
128
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=128_numstages=3_numwarps=8')
129
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=128_numstages=4_numwarps=4')
130
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=128_numstages=4_numwarps=8')
131
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=128_numstages=5_numwarps=4')
132
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=128_numstages=5_numwarps=8')
133
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=16_numstages=3_numwarps=1')
134
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=16_numstages=4_numwarps=1')
135
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=16_numstages=5_numwarps=1')
136
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=32_numstages=1_numwarps=2')
137
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=32_numstages=2_numwarps=2')
138
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=32_numstages=3_numwarps=2')
139
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=32_numstages=4_numwarps=2')
140
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=32_numstages=5_numwarps=2')
141
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=64_numstages=2_numwarps=2')
142
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=64_numstages=2_numwarps=4')
143
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=64_numstages=3_numwarps=4')
144
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=64_numstages=4_numwarps=4')
145
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=16_BLOCK-N=64_numstages=5_numwarps=4')
146
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=128_numstages=2_numwarps=8')
147
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=4')
148
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=128_numstages=4_numwarps=4')
149
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=128_numstages=4_numwarps=8')
150
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=16_numstages=4_numwarps=1')
151
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=16_numstages=5_numwarps=1')
152
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=32_numstages=4_numwarps=2')
153
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=32_numstages=5_numwarps=2')
154
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=64_numstages=2_numwarps=4')
155
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=64_numstages=4_numwarps=4')
156
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=32_BLOCK-N=64_numstages=5_numwarps=4')
157
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=4')
158
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=8')
159
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=4')
160
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=8')
161
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=4')
162
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=16_numstages=2_numwarps=2')
163
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=32_numstages=2_numwarps=4')
164
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=4')
165
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=64_numstages=5_numwarps=4')
166
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=16_numstages=1_numwarps=2')
167
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=16_numstages=2_numwarps=2')
168
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=16_numstages=5_numwarps=2')
169
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=32_numstages=1_numwarps=2')
170
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=32_numstages=2_numwarps=4')
171
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=32_numstages=5_numwarps=4')
172
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=16_BLOCK-N=64_numstages=5_numwarps=8')
173
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=32_BLOCK-N=16_numstages=2_numwarps=2')
174
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=32_BLOCK-N=16_numstages=5_numwarps=2')
175
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=32_BLOCK-N=32_numstages=2_numwarps=4')
176
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=32_BLOCK-N=32_numstages=5_numwarps=4')
177
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=32_BLOCK-N=64_numstages=5_numwarps=8')
178
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=64_BLOCK-N=16_numstages=2_numwarps=2')
179
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=64_BLOCK-N=32_numstages=2_numwarps=4')
180
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4')
181
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=16_numstages=3_numwarps=4')
182
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=16_numstages=4_numwarps=4')
183
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=16_numstages=5_numwarps=4')
184
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=32_numstages=3_numwarps=4')
185
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=32_numstages=4_numwarps=4')
186
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=4')
187
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=64_numstages=3_numwarps=4')
188
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=64_numstages=4_numwarps=4')
189
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=64_numstages=5_numwarps=4')
190
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=128_numstages=3_numwarps=4')
191
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=128_numstages=4_numwarps=4')
192
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=128_numstages=4_numwarps=8')
193
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=16_numstages=2_numwarps=4')
194
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=16_numstages=3_numwarps=4')
195
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=16_numstages=4_numwarps=4')
196
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=16_numstages=5_numwarps=4')
197
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=32_numstages=2_numwarps=4')
198
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=32_numstages=3_numwarps=4')
199
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=32_numstages=4_numwarps=4')
200
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=32_numstages=5_numwarps=4')
201
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=32_numstages=5_numwarps=8')
202
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=64_numstages=2_numwarps=4')
203
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=64_numstages=3_numwarps=4')
204
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=64_numstages=3_numwarps=8')
205
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=64_numstages=4_numwarps=4')
206
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=64_numstages=4_numwarps=8')
207
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=16_BLOCK-N=64_numstages=5_numwarps=4')
208
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=4')
209
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=128_numstages=4_numwarps=4')
210
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=128_numstages=4_numwarps=8')
211
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=16_numstages=2_numwarps=4')
212
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=16_numstages=3_numwarps=4')
213
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=16_numstages=4_numwarps=4')
214
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=16_numstages=5_numwarps=4')
215
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=2_numwarps=4')
216
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=3_numwarps=4')
217
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=3_numwarps=8')
218
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=4_numwarps=4')
219
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=4_numwarps=8')
220
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=5_numwarps=4')
221
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=32_numstages=5_numwarps=8')
222
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=2_numwarps=4')
223
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=3_numwarps=4')
224
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=3_numwarps=8')
225
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=4_numwarps=4')
226
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=4_numwarps=8')
227
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=5_numwarps=4')
228
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=4')
229
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=128_numstages=4_numwarps=4')
230
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=16_numstages=3_numwarps=4')
231
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=16_numstages=4_numwarps=4')
232
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=16_numstages=5_numwarps=4')
233
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=32_numstages=3_numwarps=4')
234
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=32_numstages=3_numwarps=8')
235
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=32_numstages=4_numwarps=4')
236
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=32_numstages=5_numwarps=4')
237
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=4')
238
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=8')
239
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=64_numstages=4_numwarps=4')
240
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=64_numstages=5_numwarps=4')
241
+
242
+ def get_name(self) -> str:
243
+ return 'mm'
244
+
245
+ def get_best_choices(self, context: AHContext) -> Optional[List[tuple[float, int]]]:
246
+ if context.get_value('arith_intensity') <= 29.89772129058838:
247
+ if context.get_value('n') <= 34.0:
248
+ if context.get_value('n') <= 18.0:
249
+ if context.get_value('k*n') <= 432.0:
250
+ if context.get_value('arith_intensity') <= 7.8700292110443115:
251
+ return [(0.098, 128), (0.098, 129), (0.098, 127), (0.073, 14), (0.073, 16), (0.073, 12), (0.073, 154), (0.073, 156), (0.073, 157), (0.073, 155), (0.049, 10), (0.049, 94), (0.049, 95), (0.048, 96)]
252
+ else:
253
+ return [(0.091, 154), (0.073, 10), (0.073, 15), (0.073, 13), (0.073, 11), (0.073, 17), (0.073, 16), (0.073, 14), (0.073, 12), (0.055, 127), (0.054, 157), (0.054, 156), (0.054, 155), (0.036, 129), (0.036, 128), (0.018, 41), (0.018, 43)]
254
+ else:
255
+ if context.get_value('k') <= 40.0:
256
+ return [(0.070, 39), (0.069, 45), (0.069, 41), (0.069, 43), (0.069, 111), (0.069, 112), (0.056, 38), (0.056, 40), (0.056, 42), (0.056, 44), (0.056, 174), (0.056, 173), (0.056, 175), (0.056, 134), (0.056, 172), (0.056, 135), (0.014, 154), (0.014, 127)]
257
+ else:
258
+ return [(0.147, 144), (0.119, 143), (0.087, 142), (0.083, 0), (0.073, 191), (0.059, 69), (0.050, 67), (0.046, 70), (0.041, 1), (0.036, 174), (0.032, 43), (0.032, 123), (0.028, 40), (0.027, 42), (0.027, 173), (0.023, 175), (0.018, 66), (0.014, 192), (0.014, 193), (0.014, 139), (0.014, 68), (0.014, 127)]
259
+ else:
260
+ if context.get_value('mat1_stride_0') <= 40.0:
261
+ if context.get_value('mat1_stride_0') <= 20.0:
262
+ return [(0.109, 23), (0.109, 21), (0.109, 20), (0.088, 0), (0.087, 131), (0.066, 18), (0.065, 130), (0.065, 132), (0.065, 159), (0.065, 160), (0.065, 161), (0.065, 158), (0.022, 22), (0.022, 19)]
263
+ else:
264
+ return [(0.065, 46), (0.064, 52), (0.064, 50), (0.064, 48), (0.064, 51), (0.064, 49), (0.064, 47), (0.064, 53), (0.064, 181), (0.064, 177), (0.064, 179), (0.064, 176), (0.038, 130), (0.038, 136), (0.026, 182), (0.026, 178), (0.026, 180), (0.026, 137), (0.025, 158), (0.013, 114), (0.013, 113)]
265
+ else:
266
+ if context.get_value('mat1_stride_0') <= 68.0:
267
+ return [(0.138, 140), (0.125, 195), (0.100, 71), (0.100, 74), (0.100, 196), (0.100, 194), (0.100, 197), (0.075, 75), (0.062, 72), (0.062, 73), (0.012, 180), (0.012, 51), (0.012, 182)]
268
+ else:
269
+ return [(0.124, 180), (0.124, 182), (0.114, 75), (0.103, 74), (0.093, 51), (0.093, 71), (0.072, 72), (0.062, 194), (0.052, 145), (0.052, 195), (0.021, 48), (0.021, 50), (0.021, 47), (0.020, 124), (0.010, 147), (0.010, 146), (0.010, 46)]
270
+ else:
271
+ if context.get_value('k') <= 18.0:
272
+ if context.get_value('m*k') <= 528.0:
273
+ return [(0.097, 88), (0.087, 92), (0.077, 90), (0.058, 105), (0.058, 103), (0.058, 104), (0.058, 99), (0.058, 100), (0.058, 106), (0.058, 93), (0.057, 91), (0.057, 97), (0.057, 98), (0.057, 101), (0.048, 102), (0.029, 87), (0.029, 89)]
274
+ else:
275
+ if context.get_value('n') <= 80.0:
276
+ return [(0.057, 161), (0.057, 130), (0.057, 24), (0.056, 164), (0.056, 163), (0.056, 166), (0.056, 168), (0.056, 30), (0.056, 28), (0.056, 26), (0.056, 25), (0.056, 27), (0.056, 29), (0.056, 31), (0.042, 131), (0.028, 99), (0.028, 101), (0.028, 100), (0.028, 167), (0.028, 165), (0.028, 133)]
277
+ else:
278
+ return [(0.110, 164), (0.108, 163), (0.106, 168), (0.069, 161), (0.066, 151), (0.060, 152), (0.055, 165), (0.050, 27), (0.050, 29), (0.048, 131), (0.043, 153), (0.037, 133), (0.037, 130), (0.028, 8), (0.028, 5), (0.027, 7), (0.026, 26), (0.016, 162), (0.012, 9), (0.007, 4), (0.005, 100), (0.005, 6), (0.005, 24)]
279
+ else:
280
+ if context.get_value('k') <= 36.0:
281
+ if context.get_value('n') <= 68.0:
282
+ return [(0.097, 184), (0.097, 56), (0.086, 186), (0.086, 183), (0.086, 188), (0.086, 58), (0.086, 60), (0.065, 54), (0.043, 187), (0.043, 185), (0.043, 57), (0.043, 61), (0.032, 55), (0.032, 130), (0.032, 59), (0.011, 181), (0.011, 163), (0.011, 136), (0.011, 138)]
283
+ else:
284
+ return [(0.117, 184), (0.117, 170), (0.117, 169), (0.107, 183), (0.106, 188), (0.075, 181), (0.064, 130), (0.064, 56), (0.053, 171), (0.032, 57), (0.032, 59), (0.032, 185), (0.011, 163), (0.011, 32), (0.011, 37), (0.011, 34), (0.011, 33), (0.011, 35), (0.011, 36), (0.011, 54)]
285
+ else:
286
+ if context.get_value('mat2_stride_0') <= 384.0:
287
+ return [(0.244, 0), (0.061, 76), (0.061, 79), (0.030, 3), (0.030, 183), (0.030, 189), (0.030, 187), (0.030, 64), (0.030, 190), (0.030, 62), (0.030, 198), (0.030, 201), (0.030, 77), (0.030, 200), (0.030, 80), (0.030, 199), (0.030, 78), (0.030, 184), (0.020, 86), (0.020, 84), (0.020, 120), (0.020, 81), (0.020, 121), (0.020, 85), (0.020, 122), (0.010, 83), (0.010, 118), (0.010, 119), (0.010, 82)]
288
+ else:
289
+ return [(0.274, 83), (0.171, 86), (0.152, 0), (0.071, 85), (0.061, 125), (0.050, 84), (0.020, 109), (0.020, 117), (0.020, 81), (0.020, 118), (0.020, 121), (0.020, 108), (0.020, 115), (0.020, 116), (0.010, 110), (0.010, 120), (0.010, 103), (0.010, 107), (0.010, 119), (0.010, 122)]
290
+ else:
291
+ if context.get_value('arith_intensity') <= 56.995582580566406:
292
+ if context.get_value('n') <= 68.0:
293
+ if context.get_value('k*n') <= 4448.0:
294
+ if context.get_value('m*n') <= 29626368.0:
295
+ return [(0.107, 198), (0.107, 200), (0.107, 201), (0.107, 199), (0.106, 76), (0.106, 79), (0.064, 197), (0.063, 56), (0.043, 184), (0.043, 187), (0.042, 80), (0.042, 77), (0.042, 183), (0.021, 78)]
296
+ else:
297
+ return [(0.073, 201), (0.073, 198), (0.073, 200), (0.073, 199), (0.073, 197), (0.073, 56), (0.073, 58), (0.073, 79), (0.073, 76), (0.072, 59), (0.072, 78), (0.072, 77), (0.072, 80), (0.018, 184), (0.018, 55), (0.018, 54)]
298
+ else:
299
+ if context.get_value('k') <= 348.0:
300
+ return [(0.206, 76), (0.183, 77), (0.169, 198), (0.160, 199), (0.053, 59), (0.046, 56), (0.038, 3), (0.030, 148), (0.030, 58), (0.030, 187), (0.023, 184), (0.015, 0), (0.008, 55), (0.008, 54)]
301
+ else:
302
+ return [(0.146, 198), (0.145, 199), (0.145, 148), (0.126, 0), (0.084, 76), (0.084, 77), (0.042, 80), (0.042, 79), (0.021, 149), (0.021, 150), (0.021, 3), (0.014, 46), (0.014, 74), (0.014, 75), (0.014, 124), (0.014, 194), (0.014, 195), (0.007, 145), (0.007, 146), (0.007, 2), (0.007, 72), (0.007, 147), (0.007, 71)]
303
+ else:
304
+ if context.get_value('m') <= 3264.0:
305
+ return [(0.247, 147), (0.115, 197), (0.066, 199), (0.066, 201), (0.066, 198), (0.049, 0), (0.049, 169), (0.049, 171), (0.033, 140), (0.033, 125), (0.033, 114), (0.016, 126), (0.016, 183), (0.016, 184), (0.016, 185), (0.016, 182), (0.016, 188), (0.016, 78), (0.016, 148), (0.016, 138), (0.016, 77), (0.016, 56), (0.016, 59)]
306
+ else:
307
+ if context.get_value('k') <= 62.5:
308
+ return [(0.226, 190), (0.226, 189), (0.122, 62), (0.122, 64), (0.055, 77), (0.055, 78), (0.037, 198), (0.036, 201), (0.036, 33), (0.024, 163), (0.018, 56), (0.018, 35), (0.018, 169), (0.006, 171)]
309
+ else:
310
+ return [(0.162, 35), (0.118, 33), (0.096, 189), (0.096, 190), (0.088, 169), (0.074, 62), (0.073, 56), (0.066, 171), (0.051, 198), (0.051, 201), (0.044, 59), (0.037, 64), (0.029, 63), (0.007, 0), (0.007, 77)]
311
+ else:
312
+ if context.get_value('m*n') <= 1097728.0:
313
+ return [(0.403, 0), (0.179, 141), (0.134, 150), (0.086, 147), (0.051, 148), (0.048, 3), (0.024, 189), (0.020, 199), (0.017, 64), (0.010, 65), (0.010, 77), (0.007, 114), (0.003, 138), (0.003, 59), (0.003, 182)]
314
+ else:
315
+ if context.get_value('m*n') <= 3244032.0:
316
+ return [(0.295, 189), (0.176, 64), (0.157, 65), (0.090, 0), (0.069, 62), (0.059, 63), (0.046, 77), (0.039, 169), (0.023, 199), (0.020, 35), (0.013, 33), (0.010, 171), (0.003, 141)]
317
+ else:
318
+ if context.get_value('n') <= 136.0:
319
+ return [(0.197, 189), (0.197, 63), (0.161, 77), (0.157, 62), (0.061, 33), (0.044, 65), (0.039, 35), (0.039, 64), (0.030, 169), (0.026, 0), (0.017, 199), (0.017, 148), (0.009, 56), (0.004, 3)]
320
+ else:
321
+ return [(0.460, 0), (0.145, 62), (0.138, 63), (0.081, 35), (0.047, 33), (0.043, 189), (0.023, 64), (0.018, 77), (0.013, 169), (0.009, 65), (0.009, 56), (0.005, 32), (0.005, 59), (0.002, 183), (0.002, 163)]
.venv/lib/python3.12/site-packages/torch/_inductor/autoheuristic/artifacts/_MixedMMA100.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa: B950
2
+ # fmt: off
3
+ # This file was generated by AutoHeuristic. Do not modify it manually!
4
+ # To regenerate this file, take a look at the steps in the README.md file inside torchgen/_autoheuristic/mixed_mm/
5
+ from typing import List, Optional, Tuple
6
+
7
+ from torch._inductor.autoheuristic.autoheuristic_utils import (
8
+ AHContext,
9
+ AHMetadata,
10
+ Choice,
11
+ )
12
+ from torch._inductor.autoheuristic.learnedheuristic_interface import (
13
+ LearnedHeuristicDecision,
14
+ )
15
+
16
+
17
+ class MixedMMA100(LearnedHeuristicDecision):
18
+
19
+ def __init__(self) -> None:
20
+ self.choices: List[Choice] = []
21
+ self.fill_choices()
22
+
23
+ def check_precondition(self, metadata: AHMetadata, context: AHContext,) -> bool:
24
+ return (
25
+ metadata.name == self.get_name()
26
+ and metadata.shared_memory == 166912
27
+ and str(metadata.device_capa) == "(8, 0)"
28
+ )
29
+
30
+ def get_confidence_threshold(self) -> float:
31
+ return 0.0
32
+
33
+ def get_choice(self, idx: int) -> Optional[str]:
34
+ if idx < len(self.choices):
35
+ return self.choices[idx]
36
+ return None
37
+
38
+ def fill_choices(self) -> None:
39
+ self.choices.append('extern_fallback_mixed_mm')
40
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=4')
41
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=4')
42
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4')
43
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=32_numstages=2_numwarps=2')
44
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=2')
45
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=64_numstages=5_numwarps=4')
46
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=256_BLOCK-N=128_numstages=3_numwarps=4')
47
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=256_BLOCK-N=128_numstages=5_numwarps=8')
48
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=8')
49
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=4')
50
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4')
51
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=32_numstages=2_numwarps=4')
52
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=4')
53
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4')
54
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=4')
55
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=64_numstages=5_numwarps=4')
56
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=4')
57
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=128_numstages=4_numwarps=8')
58
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=32_BLOCK-N=64_numstages=3_numwarps=4')
59
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=4')
60
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=8')
61
+
62
+ def get_name(self) -> str:
63
+ return 'mixed_mm'
64
+
65
+ def get_best_choices(self, context: AHContext) -> Optional[List[tuple[float, int]]]:
66
+ if str(context.get_value('1LEQmLEQ16')) != 'True':
67
+ if context.get_value('m') <= 32.5:
68
+ if context.get_value('n') <= 6976.0:
69
+ if context.get_value('n') <= 3520.0:
70
+ if context.get_value('m*n') <= 37632.0:
71
+ return None
72
+ else:
73
+ return [(1.000, 13)]
74
+ else:
75
+ if context.get_value('m*k') <= 452352.0:
76
+ return [(0.590, 13), (0.256, 8), (0.103, 7), (0.051, 11)]
77
+ else:
78
+ return [(0.778, 8), (0.222, 13)]
79
+ else:
80
+ if context.get_value('k*n') <= 102776832.0:
81
+ if context.get_value('n') <= 14656.0:
82
+ return [(1.000, 11)]
83
+ else:
84
+ return [(0.889, 11), (0.111, 13)]
85
+ else:
86
+ return [(1.000, 11)]
87
+ else:
88
+ if context.get_value('m*n') <= 446464.0:
89
+ if context.get_value('m*n') <= 223424.0:
90
+ if context.get_value('mat1_stride_0') <= 3968.0:
91
+ return None
92
+ else:
93
+ return None
94
+ else:
95
+ if context.get_value('m*n') <= 346112.0:
96
+ return [(0.960, 16), (0.040, 7)]
97
+ else:
98
+ return [(0.750, 16), (0.136, 14), (0.114, 7)]
99
+ else:
100
+ if str(context.get_value('33LEQmLEQ64')) != 'True':
101
+ if context.get_value('n') <= 6976.0:
102
+ return [(1.000, 14)]
103
+ else:
104
+ return [(0.753, 2), (0.222, 1), (0.015, 7), (0.007, 16), (0.004, 12)]
105
+ else:
106
+ if context.get_value('n') <= 13888.0:
107
+ return [(0.710, 14), (0.275, 21), (0.014, 12)]
108
+ else:
109
+ return [(0.374, 19), (0.339, 20), (0.106, 21), (0.101, 16), (0.066, 17), (0.009, 14), (0.004, 18)]
110
+ else:
111
+ if context.get_value('n') <= 3520.0:
112
+ if context.get_value('arith_intensity') <= 3.994754433631897:
113
+ if str(context.get_value('mat2_dtype')) != 'torch.uint8':
114
+ if context.get_value('m*k') <= 18944.0:
115
+ return [(0.577, 5), (0.423, 6)]
116
+ else:
117
+ return [(0.988, 5), (0.012, 6)]
118
+ else:
119
+ if context.get_value('arith_intensity') <= 2.9899919033050537:
120
+ return None
121
+ else:
122
+ return None
123
+ else:
124
+ if context.get_value('arith_intensity') <= 7.956453561782837:
125
+ if context.get_value('k*n') <= 9244032.0:
126
+ return [(0.822, 5), (0.178, 6)]
127
+ else:
128
+ return [(0.977, 5), (0.023, 0)]
129
+ else:
130
+ if context.get_value('m*k') <= 978944.0:
131
+ return [(1.000, 5)]
132
+ else:
133
+ return [(0.971, 5), (0.029, 0)]
134
+ else:
135
+ if context.get_value('n') <= 13632.0:
136
+ if context.get_value('n') <= 6976.0:
137
+ return [(1.000, 6)]
138
+ else:
139
+ if context.get_value('k') <= 3968.0:
140
+ return [(0.617, 3), (0.111, 5), (0.099, 7), (0.086, 9), (0.062, 6), (0.025, 8)]
141
+ else:
142
+ return [(0.779, 8), (0.119, 5), (0.053, 7), (0.035, 6), (0.013, 3)]
143
+ else:
144
+ if context.get_value('k*n') <= 39518208.0:
145
+ return [(0.385, 4), (0.327, 3), (0.192, 6), (0.038, 7), (0.038, 10), (0.019, 5)]
146
+ else:
147
+ if context.get_value('n') <= 20800.0:
148
+ return [(0.821, 6), (0.121, 7), (0.029, 4), (0.014, 5), (0.007, 3), (0.007, 8)]
149
+ else:
150
+ return [(0.530, 7), (0.386, 6), (0.046, 8), (0.021, 3), (0.015, 4), (0.002, 5)]
.venv/lib/python3.12/site-packages/torch/_inductor/autoheuristic/artifacts/_MixedMMH100.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa: B950
2
+ # fmt: off
3
+ # This file was generated by AutoHeuristic. Do not modify it manually!
4
+ # To regenerate this file, take a look at the steps in the README.md file inside torchgen/_autoheuristic/mixed_mm/
5
+ from typing import List, Optional, Tuple
6
+
7
+ from torch._inductor.autoheuristic.autoheuristic_utils import (
8
+ AHContext,
9
+ AHMetadata,
10
+ Choice,
11
+ )
12
+ from torch._inductor.autoheuristic.learnedheuristic_interface import (
13
+ LearnedHeuristicDecision,
14
+ )
15
+
16
+
17
+ class MixedMMH100(LearnedHeuristicDecision):
18
+
19
+ def __init__(self) -> None:
20
+ self.choices: List[Choice] = []
21
+ self.fill_choices()
22
+
23
+ def check_precondition(self, metadata: AHMetadata, context: AHContext,) -> bool:
24
+ return (
25
+ metadata.name == self.get_name()
26
+ and metadata.shared_memory == 232448
27
+ and str(metadata.device_capa) == "(9, 0)"
28
+ )
29
+
30
+ def get_confidence_threshold(self) -> float:
31
+ return 0.0
32
+
33
+ def get_choice(self, idx: int) -> Optional[str]:
34
+ if idx < len(self.choices):
35
+ return self.choices[idx]
36
+ return None
37
+
38
+ def fill_choices(self) -> None:
39
+ self.choices.append('extern_fallback_mixed_mm')
40
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=128_numstages=3_numwarps=4')
41
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=32_BLOCK-N=64_numstages=3_numwarps=4')
42
+ self.choices.append('type=triton_BLOCK-M=128_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=8')
43
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4')
44
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=32_numstages=2_numwarps=2')
45
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=2')
46
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=128_BLOCK-N=64_numstages=5_numwarps=4')
47
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=256_BLOCK-N=128_numstages=3_numwarps=4')
48
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=256_BLOCK-N=128_numstages=5_numwarps=8')
49
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=128_numstages=5_numwarps=8')
50
+ self.choices.append('type=triton_BLOCK-M=16_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=4')
51
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4')
52
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=32_numstages=2_numwarps=4')
53
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=4')
54
+ self.choices.append('type=triton_BLOCK-M=32_BLOCK-K=32_BLOCK-N=64_numstages=5_numwarps=8')
55
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=128_numstages=4_numwarps=4')
56
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=32_numstages=5_numwarps=4')
57
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=128_BLOCK-N=64_numstages=5_numwarps=4')
58
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=128_numstages=3_numwarps=4')
59
+ self.choices.append('type=triton_BLOCK-M=64_BLOCK-K=64_BLOCK-N=64_numstages=3_numwarps=8')
60
+
61
+ def get_name(self) -> str:
62
+ return 'mixed_mm'
63
+
64
+ def get_best_choices(self, context: AHContext) -> Optional[List[tuple[float, int]]]:
65
+ if context.get_value('arith_intensity') <= 15.988086223602295:
66
+ if context.get_value('n') <= 25280.0:
67
+ if context.get_value('n') <= 1344.0:
68
+ if context.get_value('mat1_stride_0') <= 7808.0:
69
+ return [(0.581, 7), (0.419, 6)]
70
+ else:
71
+ if context.get_value('m*n') <= 7680.0:
72
+ return [(0.875, 0), (0.125, 6)]
73
+ else:
74
+ return [(0.833, 0), (0.167, 7)]
75
+ else:
76
+ if context.get_value('n') <= 8512.0:
77
+ if str(context.get_value('mat2_dtype')) != 'torch.int8':
78
+ return [(0.763, 6), (0.237, 7)]
79
+ else:
80
+ return [(0.725, 7), (0.275, 6)]
81
+ else:
82
+ if str(context.get_value('mat1_dtype')) != 'torch.bfloat16':
83
+ return [(0.736, 7), (0.197, 9), (0.048, 6), (0.014, 8), (0.005, 10)]
84
+ else:
85
+ return [(0.473, 7), (0.398, 6), (0.097, 9), (0.032, 10)]
86
+ else:
87
+ if context.get_value('n') <= 42254.0:
88
+ if context.get_value('n') <= 33856.0:
89
+ if context.get_value('k*n') <= 68157440.0:
90
+ return [(0.370, 4), (0.370, 5), (0.074, 7), (0.074, 8), (0.074, 11), (0.037, 6)]
91
+ else:
92
+ return [(0.916, 8), (0.036, 7), (0.036, 9), (0.012, 4)]
93
+ else:
94
+ return [(0.659, 5), (0.341, 6)]
95
+ else:
96
+ if context.get_value('k*n') <= 326052992.0:
97
+ if context.get_value('n') <= 55232.0:
98
+ return [(0.571, 6), (0.321, 7), (0.036, 4), (0.036, 8), (0.036, 9)]
99
+ else:
100
+ return [(0.506, 6), (0.325, 8), (0.104, 7), (0.039, 5), (0.026, 9)]
101
+ else:
102
+ if context.get_value('n') <= 57024.0:
103
+ return [(0.462, 9), (0.385, 7), (0.115, 6), (0.038, 8)]
104
+ else:
105
+ return [(0.598, 8), (0.223, 9), (0.107, 6), (0.071, 7)]
106
+ else:
107
+ if context.get_value('m*n') <= 543936.0:
108
+ if str(context.get_value('17LEQmLEQ32')) != 'True':
109
+ if context.get_value('m*n') <= 262272.0:
110
+ if context.get_value('n') <= 1592.5:
111
+ return [(0.860, 0), (0.140, 9)]
112
+ else:
113
+ return None
114
+ else:
115
+ if context.get_value('m*k') <= 1294336.0:
116
+ return [(0.833, 17), (0.150, 18), (0.017, 15)]
117
+ else:
118
+ return [(0.917, 17), (0.083, 8)]
119
+ else:
120
+ if context.get_value('n') <= 12416.0:
121
+ if context.get_value('m*n') <= 43008.0:
122
+ return None
123
+ else:
124
+ return [(0.853, 14), (0.147, 9)]
125
+ else:
126
+ return [(0.625, 12), (0.375, 14)]
127
+ else:
128
+ if context.get_value('m') <= 32.5:
129
+ if context.get_value('mat2_stride_1') <= 6656.0:
130
+ if context.get_value('n') <= 69184.0:
131
+ return [(0.611, 12), (0.361, 14), (0.028, 13)]
132
+ else:
133
+ return [(1.000, 12)]
134
+ else:
135
+ if context.get_value('mat2_stride_1') <= 20864.0:
136
+ return [(1.000, 12)]
137
+ else:
138
+ return [(0.958, 12), (0.042, 9)]
139
+ else:
140
+ if context.get_value('m*n') <= 1085440.0:
141
+ if context.get_value('n') <= 9152.0:
142
+ return [(1.000, 18)]
143
+ else:
144
+ return [(0.780, 18), (0.160, 16), (0.060, 20)]
145
+ else:
146
+ if context.get_value('m') <= 67.0:
147
+ return [(0.650, 16), (0.203, 19), (0.122, 18), (0.016, 20), (0.008, 1)]
148
+ else:
149
+ return [(0.561, 3), (0.185, 16), (0.096, 20), (0.083, 19), (0.076, 2)]
.venv/lib/python3.12/site-packages/torch/_inductor/autoheuristic/artifacts/_PadMMA100.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa: B950
2
+ # fmt: off
3
+ # This file was generated by AutoHeuristic. Do not modify it manually!
4
+ # To regenerate this file, take a look at the steps in the README.md file inside torchgen/_autoheuristic/pad_mm/
5
+ from torch._inductor.autoheuristic.autoheuristic_utils import AHContext, AHMetadata, Choice, CHOICE_COL
6
+ from torch._inductor.autoheuristic.learnedheuristic_interface import (
7
+ LearnedHeuristicRegression,
8
+ )
9
+
10
+
11
+ class PadMMA100(LearnedHeuristicRegression):
12
+
13
+ def __init__(self) -> None:
14
+ pass
15
+
16
+ def check_precondition(self, metadata: AHMetadata, context: AHContext,) -> bool:
17
+ return (
18
+ metadata.name == self.get_name()
19
+ and metadata.shared_memory == 166912
20
+ and str(metadata.device_capa) == "(8, 0)"
21
+ )
22
+
23
+ def get_feedback(self, context: AHContext, choice: Choice) -> float:
24
+ context.context_dict[CHOICE_COL] = choice
25
+ return self.predict(context)
26
+
27
+ def get_confidence_threshold(self) -> float:
28
+ return 1.7025303314066
29
+
30
+ def get_name(self) -> str:
31
+ return 'pad_mm'
32
+
33
+ def predict(self, context: AHContext) -> float:
34
+ if str(context.get_value('choice')) != 'pad':
35
+ if str(context.get_value('using_tf32')) != 'False':
36
+ if context.get_value('m*n') <= 4171264.0:
37
+ if context.get_value('m*k') <= 3999308.0:
38
+ return 1.8751469764071178
39
+ else:
40
+ if str(context.get_value('n_multiple_32')) != 'True':
41
+ return 0.9117231355626345
42
+ else:
43
+ return 1.1607689608873861
44
+ else:
45
+ if str(context.get_value('n_multiple_2')) != 'True':
46
+ if str(context.get_value('using_tf32')) != 'True':
47
+ return 0.7430382200435992
48
+ else:
49
+ return 0.8531269794448678
50
+ else:
51
+ if str(context.get_value('k_multiple_2')) != 'True':
52
+ return 0.7577181972719917
53
+ else:
54
+ return 0.8977349440424219
55
+ else:
56
+ if context.get_value('m*n') <= 1299712.0:
57
+ return 1.1669723418995592
58
+ else:
59
+ if context.get_value('mat2_stride_1') <= 45217.5:
60
+ if context.get_value('m*n') <= 55884158.0:
61
+ return 1.0262769936909601
62
+ else:
63
+ return 1.0022677428470845
64
+ else:
65
+ if context.get_value('m') <= 18478.0:
66
+ return 1.1127066261894312
67
+ else:
68
+ return 1.0337740659894263
69
+ else:
70
+ if str(context.get_value('mat1_dtype')) != 'torch.float32':
71
+ if str(context.get_value('n_multiple_2')) != 'False':
72
+ if str(context.get_value('k_multiple_2')) != 'True':
73
+ if context.get_value('mat1_stride_0') <= 561.0:
74
+ return 1.2900382135142956
75
+ else:
76
+ return 1.5761737616057887
77
+ else:
78
+ if context.get_value('num_dims_needs_padding') <= 1.5:
79
+ return 1.0472263310239422
80
+ else:
81
+ return 1.1727673465762514
82
+ else:
83
+ if context.get_value('k') <= 28238.5:
84
+ if context.get_value('k/(m*n)') <= 0.00026227018679492176:
85
+ return 1.6770542505397175
86
+ else:
87
+ return 1.3974785435105923
88
+ else:
89
+ if str(context.get_value('mat1_dtype')) != 'torch.bfloat16':
90
+ return 1.3952699800111992
91
+ else:
92
+ return 1.5759286511628336
93
+ else:
94
+ if str(context.get_value('using_tf32')) != 'False':
95
+ if context.get_value('m*n') <= 14119424.0:
96
+ return 0.8875772670422478
97
+ else:
98
+ if str(context.get_value('mat2_innermost_needs_padding')) != 'True':
99
+ return 1.1467728924377265
100
+ else:
101
+ return 1.215842963532998
102
+ else:
103
+ if context.get_value('arith_intensity') <= 396.8774871826172:
104
+ return 0.89940161869551
105
+ else:
106
+ if context.get_value('mat2_stride_1') <= 45217.5:
107
+ return 0.9964328169353532
108
+ else:
109
+ return 0.9493479238294826
.venv/lib/python3.12/site-packages/torch/_inductor/autoheuristic/artifacts/__init__.py ADDED
File without changes
.venv/lib/python3.12/site-packages/torch/_inductor/autoheuristic/autoheuristic.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from functools import partial
4
+ from typing import Any, Callable, Optional
5
+
6
+ import torch
7
+ from torch._inductor.autoheuristic.autoheuristic_utils import (
8
+ AHContext,
9
+ AHMetadata,
10
+ AHOperation,
11
+ Choice,
12
+ CHOICE_COL,
13
+ Feedback,
14
+ FEEDBACK_COL,
15
+ get_metadata_str_from_log,
16
+ )
17
+ from torch._inductor.autoheuristic.learned_heuristic_controller import (
18
+ LearnedHeuristicController,
19
+ )
20
+ from torch._inductor.ir import ChoiceCaller
21
+ from torch._inductor.runtime.runtime_utils import cache_dir
22
+ from torch._inductor.utils import get_gpu_shared_memory
23
+
24
+
25
+ class LocalFeedback:
26
+ """
27
+ To be able to collect data for a choice, a function providing feedback given a choice has to be provided.
28
+ LocalFeedback can be used when AutoHeuristic should immediately run the function to collect feedback for each choice
29
+ (see pad_mm.py, where the autotuning happens locally, for an example).
30
+ """
31
+
32
+ def __init__(self, feedback_fn: Callable[[Choice], Feedback]) -> None:
33
+ self.feedback_fn = feedback_fn
34
+
35
+ def __call__(self, choice: Choice) -> Feedback:
36
+ return self.feedback_fn(choice)
37
+
38
+
39
class InconsistentMetadata(Exception):
    """
    Raised when AutoHeuristic logs to a file whose stored metadata does not
    match the metadata it would write for the current run.
    """
44
+
45
+
46
class AutoHeuristic:
    """
    Framework for collecting data and querying learned heuristics.

    AutoHeuristic records (context, choice, feedback) tuples; the collected
    data can be used offline to train a heuristic such as a regression tree
    (see torchgen/autoheuristic/), which can later be queried at runtime.
    """

    # Feedback gathered locally during this run, keyed by choice.
    collected_feedback: dict[Choice, Feedback]

    def __init__(
        self,
        fallback: Callable[[], Choice],
        choices: list[Choice],
        feedback: Optional[LocalFeedback],
        context: AHContext,
        name: str,
        augment_context: Optional[list[AHOperation]] = None,
        precondition: Optional[Callable[[AHMetadata, AHContext], bool]] = None,
    ) -> None:
        """
        Args:
            fallback: Returns a Choice when the heuristic is unsure which
                choice to make, or when AutoHeuristic is collecting data.
            choices: All choices the heuristic may pick from.
            feedback: Optional LocalFeedback that measures each choice.
            context: Features stored alongside each choice and feedback.
            name: Identifier of this heuristic.
            augment_context: Optional derived-feature operations applied to
                the context before querying a learned heuristic.
            precondition: Optional predicate gating whether AutoHeuristic runs.
        """
        self.fallback = fallback
        self.choices = choices
        self.feedback = feedback
        self.context = context
        self.name = name
        self.collected_feedback = {}
        self.augment_context = augment_context
        # Shared memory size + compute capability identify the GPU model the
        # data was collected on.
        self.metadata = AHMetadata(
            get_gpu_shared_memory(),
            torch.cuda.get_device_capability(),
            self.choices,
            self.name,
        )
        self.precondition = precondition

        if not self.satisfies_precondition():
            return

        if torch._inductor.config.autoheuristic_log_path == "DEFAULT":
            self.log_path = self.get_default_log_path()
        else:
            self.log_path = torch._inductor.config.autoheuristic_log_path

        # In collection mode, eagerly measure and log every choice.
        if torch._inductor.config.collect_autoheuristic(self.name):
            if self.feedback is not None:
                for candidate in self.choices:
                    self.save_data(candidate, self.feedback(candidate))

    def satisfies_precondition(self) -> bool:
        if self.precondition is None:
            return True
        return self.precondition(self.metadata, self.context)

    def get_choice(self) -> Choice:
        """
        Returns the chosen option based on the value of autoheuristic_use.
        If self.name is enabled there, a learned heuristic is queried;
        otherwise (or when the heuristic is unsure) the fallback is used.
        """
        if not self.satisfies_precondition():
            return self.fallback()

        if torch._inductor.config.use_autoheuristic(self.name):
            if self.augment_context is not None:
                self.context.apply_operations(self.augment_context)
            decision = LearnedHeuristicController(
                self.metadata,
                self.context,
            ).get_decision()
            if decision not in self.choices:
                # TODO(AlnisM): We might want to allow this in the future
                return self.fallback()
            if decision is not None:
                return decision
        return self.fallback()

    def get_top_k_choices(
        self, top_k: int, always_included: Optional[list[str]] = None
    ) -> Optional[list[Choice]]:
        """Ranked top-k choices from a learned heuristic, or None."""
        if not self.satisfies_precondition():
            return None
        if not torch._inductor.config.use_autoheuristic(self.name):
            return None
        if self.augment_context is not None:
            self.context.apply_operations(self.augment_context)
        controller = LearnedHeuristicController(
            self.metadata,
            self.context,
        )
        ranked = controller.get_decisions_ranked(top_k)
        if ranked is None:
            return None
        # Make sure mandatory choices survive the top-k cut.
        if always_included is not None:
            for mandatory in always_included:
                if mandatory not in ranked:
                    ranked.append(mandatory)
        return ranked

    def get_collected_feedback(self, choice: Choice) -> Any:
        return self.collected_feedback.get(choice, None)

    @staticmethod
    def get_device_identifier() -> str:
        # A heuristic might work well for one GPU but not another, so the
        # collected data (and learned heuristic) is keyed per GPU model.
        # TODO(AlnisM): just using the device name for now, but the same GPU
        # model can have different names
        return torch.cuda.get_device_name().replace(" ", "_")

    def get_default_log_path(self) -> str:
        directory = f"{cache_dir()}/autoheuristic/{self.get_device_identifier()}/"
        os.makedirs(directory, exist_ok=True)
        return directory + f"{self.name}.txt"

    def serialize_metadata(self) -> str:
        """Metadata plus feature schema as the JSON first-line of the log."""
        metadata_dict = self.metadata.to_dict()
        (
            numerical,
            categorical,
        ) = self.context.get_numerical_and_categorical_features()
        metadata_dict["numerical_features"] = numerical
        metadata_dict["categorical_features"] = categorical
        return json.dumps(metadata_dict)

    def save_data(self, choice: Choice, feedback_val: Feedback) -> None:
        """
        Appends one (features, choice, feedback) record to the log file,
        writing metadata and the CSV header first when the file is new.
        """
        self.collected_feedback[choice] = feedback_val
        log_path = self.log_path

        lines = []
        if os.path.exists(log_path):
            # An existing log must have been written with the same schema.
            metadata = self.serialize_metadata()
            existing_metadata = get_metadata_str_from_log(self.log_path)
            if existing_metadata != metadata:
                raise InconsistentMetadata(
                    "Given metadata does not match existing metadata"
                )
        else:
            lines.append(self.serialize_metadata())
            feature_header = self.context.get_feature_names_csv()
            lines.append(feature_header + "," + CHOICE_COL + "," + FEEDBACK_COL)

        feature_values = self.context.get_feature_values_csv()
        lines.append(feature_values + "," + choice + "," + str(feedback_val))

        with open(log_path, "a") as f:
            f.write("\n".join(lines) + "\n")
215
+
216
+
217
class AutoHeuristicSelectAlgorithm(AutoHeuristic):
    """
    AutoHeuristic variant for kernel-choice selection: choices are
    ChoiceCallers and feedback (timings) comes from select_algorithm.
    """

    def __init__(
        self,
        fallback: Callable[[], Optional[ChoiceCaller]],
        choices: list[ChoiceCaller],
        input_nodes: list[Any],
        context: AHContext,
        name: str,
        augment_context: Optional[list[AHOperation]] = None,
        precondition: Optional[Callable[[AHMetadata, AHContext], bool]] = None,
    ) -> None:
        """
        choices, input_nodes and name must match the arguments of the
        corresponding autotune_select_algorithm() call, e.g. for
        autotune_select_algorithm(name, choices, input_nodes, layout) the
        same name, choices and input_nodes have to be used here.
        """
        self.input_nodes = input_nodes
        # Map each ChoiceCaller's stable string id back to the caller.
        self.choicestr2choice: dict[str, ChoiceCaller] = {
            choice.autoheuristic_id(): choice for choice in choices
        }

        def fallback_str() -> str:
            fallback_choice = fallback()
            if fallback_choice is None:
                # TODO: Find a nicer way to handle this
                return "unsure"
            return fallback_choice.autoheuristic_id()

        super().__init__(
            fallback_str,
            list(self.choicestr2choice.keys()),
            None,
            context,
            name,
            augment_context,
            precondition,
        )

        if (
            torch._inductor.config.collect_autoheuristic(self.name)
            and self.satisfies_precondition()
        ):
            self.register_global_feedback(input_nodes, choices)

    def register_global_feedback(
        self, input_nodes: list[Any], choices: list[ChoiceCaller]
    ) -> None:
        """
        Registers a callback in select_algorithm that is called with the
        measured timing of every choice for this autotuning request.
        """

        from torch._inductor.select_algorithm import (
            add_feedback_saver,
            create_inputs_key,
            create_precompile_key,
        )

        def store_global_feedback(
            ah_inputs_key: str,
            ah_precompile_key: str,
            timings: dict[ChoiceCaller, float],
            name: str,
            input_nodes: list[Any],
            choices: list[ChoiceCaller],
        ) -> None:
            # Ignore feedback belonging to unrelated autotuning requests.
            current_inputs_key = create_inputs_key(input_nodes)
            if current_inputs_key != ah_inputs_key:
                return
            current_precompile_key = create_precompile_key(
                name, current_inputs_key, choices
            )
            if current_precompile_key != ah_precompile_key:
                return
            for choice, time in timings.items():
                self.save_data(choice.autoheuristic_id(), time)

        inputs_key = create_inputs_key(input_nodes)
        precompile_key = create_precompile_key(self.name, inputs_key, choices)
        add_feedback_saver(partial(store_global_feedback, inputs_key, precompile_key))

    def get_choice_caller(self) -> Optional[ChoiceCaller]:
        return self.choicestr2choice.get(self.get_choice(), None)

    def get_top_k_choices_caller(
        self, top_k: int, always_included: Optional[list[str]] = None
    ) -> Optional[list[ChoiceCaller]]:
        ranked = self.get_top_k_choices(top_k, always_included)
        if ranked is None:
            return None
        return [self.choicestr2choice[choice] for choice in ranked]
.venv/lib/python3.12/site-packages/torch/_inductor/autoheuristic/autoheuristic_utils.py ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ from typing import Any, Callable
3
+
4
+ import torch
5
+
6
+
7
# Type aliases used throughout autoheuristic.
Feedback = float  # measured quality of a choice (e.g. a timing)
Choice = str  # identifier of one option the heuristic can pick
Value = Any  # value of a stored feature

# Column names used in collected-data logs.
CHOICE_COL = "choice"
FEEDBACK_COL = "feedback"
13
+
14
+
15
+ class AHFeature:
16
+ """
17
+ The context, that AutoHeuristic stores, is a list of features. AutoHeuristic needs to know whether a feature is
18
+ categorical (i.e., not a continuous variable) to learn a machine learning model.
19
+ """
20
+
21
+ def __init__(self, name: str, value: Value, is_categorical: bool = False) -> None:
22
+ self.name = name
23
+ self.value = value
24
+ self.is_categorical = is_categorical
25
+
26
+
27
+ class AHOperation:
28
+ """
29
+ AHOperation can be used to augment the data collected by AutoHeuristic.
30
+ One might for example store features like m, k, n, but also want to use
31
+ features like m*n, or k*n, to learn a heuristic. Instead of storing features
32
+ that can be created from the collected data, one can use AHOperation to
33
+ create new features from the collected data.
34
+ """
35
+
36
+ def __init__(
37
+ self, name: str, func: Callable[[Any], Value], is_categorical: bool = False
38
+ ) -> None:
39
+ self.name = name
40
+ self.func = func
41
+ self.is_categorical = is_categorical
42
+
43
+ def apply_operation(self, data: Any) -> None:
44
+ data[self.name] = self.func(data)
45
+
46
+
47
+ class AHContext:
48
+ """
49
+ This class is used to specify which information AutoHeuristic should store. For each choice, AutoHeursitic will
50
+ store the context and the collected feedback. The context could be something like the shape of a tensor, i.e.,
51
+ information that will help to learn a heuristic.
52
+ """
53
+
54
+ features: list[AHFeature]
55
+ context_dict: dict[str, Value]
56
+
57
+ def __init__(self) -> None:
58
+ self.features = []
59
+ self.context_dict = {}
60
+
61
+ def add_feature(
62
+ self, name: str, value: Value, is_categorical: bool = False
63
+ ) -> None:
64
+ self.features.append(AHFeature(name, value, is_categorical=is_categorical))
65
+ self.context_dict[name] = value
66
+
67
+ def get_numerical_and_categorical_features(self) -> tuple[list[str], list[str]]:
68
+ numerical_features = []
69
+ categorical_features = []
70
+ for feature in self.features:
71
+ if feature.is_categorical:
72
+ categorical_features.append(feature.name)
73
+ else:
74
+ numerical_features.append(feature.name)
75
+
76
+ return numerical_features, categorical_features
77
+
78
+ def get_feature_names_csv(self) -> str:
79
+ return ",".join(feature.name for feature in self.features)
80
+
81
+ def get_feature_values_csv(self) -> str:
82
+ return ",".join(str(feature.value) for feature in self.features)
83
+
84
+ def get_value(self, name: str) -> Value:
85
+ return self.context_dict[name]
86
+
87
+ def apply_operations(self, operations: list[AHOperation]) -> None:
88
+ for op in operations:
89
+ op.apply_operation(self.context_dict)
90
+
91
+
92
+ class AHMetadata:
93
+ def __init__(
94
+ self,
95
+ shared_memory: Any,
96
+ device_capa: tuple[int, int],
97
+ choices: list[Choice],
98
+ name: str,
99
+ ) -> None:
100
+ # use amount of shared_memory and device_capability to identify GPU
101
+ # TODO(AlnisM): there might be a better way to do this
102
+ self.shared_memory = shared_memory
103
+ self.device_capa = device_capa
104
+ self.choices = choices
105
+ self.name = name
106
+
107
+ def to_dict(self) -> dict[str, Value]:
108
+ return {
109
+ "shared_memory": self.shared_memory,
110
+ "device_capa": self.device_capa,
111
+ "name": self.name,
112
+ }
113
+
114
+
115
def get_metadata_str_from_log(log_path: str) -> str:
    """Return the metadata JSON string stored on the first line of a log."""
    with open(log_path, newline="") as fh:
        return fh.readline().strip()
119
+
120
+
121
+ def check_minsize(context: AHContext, minsize: int) -> bool:
122
+ return (
123
+ context.get_value("m") >= minsize
124
+ and context.get_value("k") >= minsize
125
+ and context.get_value("n") >= minsize
126
+ )
127
+
128
+
129
+ def pad_mm_precondition(metadata: AHMetadata, context: AHContext) -> bool:
130
+ if metadata.shared_memory == 166912 and metadata.device_capa == (8, 0):
131
+ # A100 precondition
132
+ return check_minsize(context, 512)
133
+ elif metadata.shared_memory == 232448 and metadata.device_capa == (9, 0):
134
+ # H100 precondition
135
+ return check_minsize(context, 768)
136
+ return True
137
+
138
+
139
+ def get_mixedmm_precondition(metadata: AHMetadata, context: AHContext) -> bool:
140
+ m = context.get_value("m")
141
+ k = context.get_value("k")
142
+ n = context.get_value("n")
143
+ if m > 128 or k < 1024 or n < 1024:
144
+ return False
145
+ mat1_iscontig = context.get_value("mat1_iscontig")
146
+ mat2_iscontig = context.get_value("mat2_iscontig")
147
+ return mat1_iscontig and not mat2_iscontig
148
+
149
+
150
+ def get_mult_dims_ops() -> list[AHOperation]:
151
+ m_times_k_op = AHOperation("m*k", lambda data: data["m"] * data["k"])
152
+ m_times_n_op = AHOperation("m*n", lambda data: data["m"] * data["n"])
153
+ k_times_n_op = AHOperation("k*n", lambda data: data["k"] * data["n"])
154
+ return [m_times_k_op, m_times_n_op, k_times_n_op]
155
+
156
+
157
def get_arith_intensity(data: Any) -> float:
    """
    Arithmetic-intensity proxy for an (m, k) x (k, n) matmul: m*k*n work
    over m*k + k*n + m*n elements touched. Returns 0.0 for degenerate dims.
    """
    m, k, n = data["m"], data["k"], data["n"]
    if 0 in (m, k, n):
        return 0.0
    return m * k * n / (m * k + k * n + m * n)
164
+
165
+
166
+ def pad_mm_operations() -> list[AHOperation]:
167
+ mult_dims_ops = get_mult_dims_ops()
168
+ k_div_m_times_n_op = AHOperation(
169
+ "k/(m*n)", lambda data: data["k"] / (data["m"] * data["n"])
170
+ )
171
+
172
+ def bfloat_perf_hit(data: Any) -> bool:
173
+ m = data["m"]
174
+ k = data["k"]
175
+ n = data["n"]
176
+ is_bfloat = str(data["mat1_dtype"]) == "torch.bfloat16"
177
+ return k > (m * 1024) and k > (n * 1024) and is_bfloat
178
+
179
+ bfloat_perf_hit_op = AHOperation(
180
+ "bfloat_perf_hit", bfloat_perf_hit, is_categorical=True
181
+ )
182
+
183
+ arith_intensity_op = AHOperation("arith_intensity", get_arith_intensity)
184
+ dims_need_padding_ops = get_dims_need_padding_ops()
185
+ dims_multiple_ops = get_dims_multiple_ops()
186
+ is_contig_ops = get_is_contig_ops()
187
+
188
+ ah_operations = mult_dims_ops + [
189
+ k_div_m_times_n_op,
190
+ bfloat_perf_hit_op,
191
+ arith_intensity_op,
192
+ ]
193
+ ah_operations.extend(dims_need_padding_ops)
194
+ ah_operations.extend(dims_multiple_ops)
195
+ ah_operations.extend(is_contig_ops)
196
+ return ah_operations
197
+
198
+
199
def between_op(data: Any, dim: str, lower: int, upper: int) -> bool:
    """True iff data[dim] lies in the closed interval [lower, upper]."""
    return lower <= data[dim] <= upper
201
+
202
+
203
+ def between_ops() -> list[AHOperation]:
204
+ dims = ["m", "k", "n"]
205
+ limits = [(1, 16), (17, 32), (33, 64), (65, 128), (129, 256)]
206
+ ah_operations = []
207
+ for dim in dims:
208
+ for lower, upper in limits:
209
+ between_op_fn = functools.partial(
210
+ between_op, dim=dim, lower=lower, upper=upper
211
+ )
212
+ # using 'LEQ' instead of '<=' because '<=' cannot be exported to dot
213
+ between_op_name = f"{lower}LEQ{dim}LEQ{upper}"
214
+ ah_operations.append(
215
+ AHOperation(between_op_name, between_op_fn, is_categorical=True)
216
+ )
217
+ return ah_operations
218
+
219
+
220
def pow2_op(data: Any, dim: str, exponent: int) -> bool:
    """True iff data[dim] equals two to the given power."""
    return data[dim] == 2**exponent
222
+
223
+
224
+ def mm_operations() -> list[AHOperation]:
225
+ mult_dims_ops = get_mult_dims_ops()
226
+ arith_intensity_op = AHOperation("arith_intensity", get_arith_intensity)
227
+ return mult_dims_ops + [arith_intensity_op]
228
+
229
+
230
+ def mixed_mm_operations() -> list[AHOperation]:
231
+ return mm_operations() + between_ops()
232
+
233
+
234
def is_multiple(data: Any, dim: str, mult: int) -> bool:
    """True iff data[dim] is divisible by *mult*."""
    return (data[dim] % mult) == 0
236
+
237
+
238
+ def get_dims_multiple_ops() -> list[AHOperation]:
239
+ multiples = [2, 4, 8, 16, 32]
240
+ dims = ["m", "k", "n"]
241
+ dims_multiple_ops = []
242
+ for dim in dims:
243
+ for mult in multiples:
244
+ is_multiple_fn = functools.partial(is_multiple, dim=dim, mult=mult)
245
+ dims_multiple_op = AHOperation(
246
+ f"{dim}_multiple_{mult}", is_multiple_fn, is_categorical=True
247
+ )
248
+ dims_multiple_ops.append(dims_multiple_op)
249
+ return dims_multiple_ops
250
+
251
+
252
+ def get_dims_need_padding_ops() -> list[AHOperation]:
253
+ def mat1_innermost_needs_padding_fn(data: Any) -> bool:
254
+ mat1_stride_0 = data["mat1_stride_0"]
255
+ mat1_stride_1 = data["mat1_stride_1"]
256
+ m_padded_length = data["m_padded_length"]
257
+ k_padded_length = data["k_padded_length"]
258
+ mat1_innermost_needs_padding = False
259
+ if mat1_stride_0 == 1 and m_padded_length != 0:
260
+ mat1_innermost_needs_padding = True
261
+ if mat1_stride_1 == 1 and k_padded_length != 0:
262
+ mat1_innermost_needs_padding = True
263
+ return mat1_innermost_needs_padding
264
+
265
+ mat1_innermost_op = AHOperation(
266
+ "mat1_innermost_needs_padding",
267
+ mat1_innermost_needs_padding_fn,
268
+ is_categorical=True,
269
+ )
270
+
271
+ def mat2_innermost_needs_padding_fn(data: Any) -> bool:
272
+ mat2_stride_0 = data["mat2_stride_0"]
273
+ mat2_stride_1 = data["mat2_stride_1"]
274
+ k_padded_length = data["k_padded_length"]
275
+ n_padded_length = data["n_padded_length"]
276
+ mat2_innermost_needs_padding = False
277
+ if mat2_stride_0 == 1 and k_padded_length != 0:
278
+ mat2_innermost_needs_padding = True
279
+ if mat2_stride_1 == 1 and n_padded_length != 0:
280
+ mat2_innermost_needs_padding = True
281
+ return mat2_innermost_needs_padding
282
+
283
+ mat2_innermost_op = AHOperation(
284
+ "mat2_innermost_needs_padding",
285
+ mat2_innermost_needs_padding_fn,
286
+ is_categorical=True,
287
+ )
288
+
289
+ def num_dims_needs_padding_fn(data: Any) -> int:
290
+ m_padded_length = data["m_padded_length"]
291
+ k_padded_length = data["k_padded_length"]
292
+ n_padded_length = data["n_padded_length"]
293
+ num_dims_needs_padding = 0
294
+ if m_padded_length != 0:
295
+ num_dims_needs_padding += 1
296
+ if k_padded_length != 0:
297
+ num_dims_needs_padding += 1
298
+ if n_padded_length != 0:
299
+ num_dims_needs_padding += 1
300
+ return num_dims_needs_padding
301
+
302
+ num_dims_op = AHOperation("num_dims_needs_padding", num_dims_needs_padding_fn)
303
+ return [mat1_innermost_op, mat2_innermost_op, num_dims_op]
304
+
305
+
306
+ def get_is_contig_ops() -> list[AHOperation]:
307
+ def mat1_is_contig_fn(data: Any) -> bool:
308
+ stride_0 = data["mat1_stride_0"]
309
+ stride_1 = data["mat1_stride_1"]
310
+ k = data["k"]
311
+ return stride_0 == k and stride_1 == 1
312
+
313
+ mat1_is_contig_op = AHOperation(
314
+ "mat1_iscontig", mat1_is_contig_fn, is_categorical=True
315
+ )
316
+
317
+ def mat2_is_contig_fn(data: Any) -> bool:
318
+ stride_0 = data["mat2_stride_0"]
319
+ stride_1 = data["mat2_stride_1"]
320
+ n = data["n"]
321
+ return stride_0 == n and stride_1 == 1
322
+
323
+ mat2_is_contig_op = AHOperation(
324
+ "mat2_iscontig", mat2_is_contig_fn, is_categorical=True
325
+ )
326
+
327
+ return [mat1_is_contig_op, mat2_is_contig_op]
328
+
329
+
330
+ def context_add_strides(context: AHContext, name: str, stride: tuple[int, ...]) -> None:
331
+ for i, s in enumerate(stride):
332
+ context.add_feature(f"{name}_stride_{i}", s)
333
+
334
+
335
+ def context_add_using_tf32(context: AHContext, dtype: torch.dtype) -> None:
336
+ using_tf32 = "not_float_32"
337
+ if dtype == torch.float32:
338
+ using_tf32 = torch.backends.cuda.matmul.allow_tf32
339
+ context.add_feature("using_tf32", using_tf32, is_categorical=True)
.venv/lib/python3.12/site-packages/torch/_inductor/autoheuristic/learned_heuristic_controller.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib
2
+ import inspect
3
+ import pkgutil
4
+ from collections import defaultdict
5
+ from typing import Any, Optional
6
+
7
+ from torch._inductor.autoheuristic.autoheuristic_utils import (
8
+ AHContext,
9
+ AHMetadata,
10
+ Choice,
11
+ )
12
+ from torch._inductor.autoheuristic.learnedheuristic_interface import LearnedHeuristic
13
+
14
+
15
+ def find_and_instantiate_subclasses(
16
+ package_name: str, base_class: Any
17
+ ) -> list[LearnedHeuristic]:
18
+ instances = []
19
+
20
+ package = importlib.import_module(package_name)
21
+ for _, module_name, _ in pkgutil.walk_packages(
22
+ package.__path__, package.__name__ + "."
23
+ ):
24
+ try:
25
+ module_basename = module_name.split(".")[-1]
26
+ if not module_basename.startswith("_"):
27
+ # learned heuristics start with an underscore
28
+ continue
29
+ module = importlib.import_module(module_name)
30
+
31
+ # look for classes that are subclasses of base_class
32
+ for _name, obj in inspect.getmembers(module):
33
+ if (
34
+ inspect.isclass(obj)
35
+ and issubclass(obj, base_class)
36
+ and obj != base_class
37
+ ):
38
+ instance = obj()
39
+ instances.append(instance)
40
+ except Exception as e:
41
+ print(f"Error processing module {module_name}: {e}")
42
+
43
+ return instances
44
+
45
+
46
+ class LearnedHeuristicController:
47
+ """
48
+ Class that finds and instantiates all learned heuristics. It also provides
49
+ a way to get the decision of a learned heuristic.
50
+ """
51
+
52
+ existing_heuristics: dict[str, list[LearnedHeuristic]] = defaultdict(list)
53
+ """
54
+ A dictionary that stores all the learned heuristics for each optimization.
55
+ The key is the optimization name, and the value is a list of LearnedHeuristic objects.
56
+ """
57
+
58
+ heuristics_initialized: bool = False
59
+ """
60
+ A flag that indicates whether the learned heuristics have been initialized.
61
+ Set to true when the get_decision() function is called for the first time.
62
+ """
63
+
64
+ def __init__(
65
+ self,
66
+ metadata: AHMetadata,
67
+ context: AHContext,
68
+ ) -> None:
69
+ self.metadata = metadata
70
+ self.context = context
71
+
72
+ def get_heuristics(self, name: str) -> list[LearnedHeuristic]:
73
+ """
74
+ Returns a list of learned heuristics for the given optimization name.
75
+ """
76
+
77
+ if not LearnedHeuristicController.heuristics_initialized:
78
+ # learned heuristics are generated into the following package
79
+ learned_heuristics_package = "torch._inductor.autoheuristic.artifacts"
80
+
81
+ # learned heuristics have to be of type LearnedHeuristic
82
+ base_class = LearnedHeuristic
83
+ found_heuristics = find_and_instantiate_subclasses(
84
+ learned_heuristics_package, base_class
85
+ )
86
+
87
+ for learned_heuristic in found_heuristics:
88
+ opt_name = learned_heuristic.get_name()
89
+ LearnedHeuristicController.existing_heuristics[opt_name].append(
90
+ learned_heuristic
91
+ )
92
+ LearnedHeuristicController.heuristics_initialized = True
93
+
94
+ return LearnedHeuristicController.existing_heuristics[name]
95
+
96
+ def get_decision(self) -> Optional[Choice]:
97
+ """
98
+ Returns the decision made by the learned heuristic or None if no heuristic was found or the heuristic is unsure
99
+ which choice to make.
100
+ """
101
+
102
+ heuristics = self.get_heuristics(self.metadata.name)
103
+ for heuristic in heuristics:
104
+ if heuristic.check_precondition(self.metadata, self.context):
105
+ return heuristic.get_decision(self.context, self.metadata.choices)
106
+ return None
107
+
108
+ def get_decisions_ranked(self, top_k: int) -> Optional[list[Choice]]:
109
+ heuristics = self.get_heuristics(self.metadata.name)
110
+ for heuristic in heuristics:
111
+ if heuristic.check_precondition(self.metadata, self.context):
112
+ choices = heuristic.get_decisions_ranked(self.context)
113
+ if choices is None:
114
+ return None
115
+ avail_choices = [
116
+ choice for choice in choices if choice in self.metadata.choices
117
+ ]
118
+ return avail_choices[:top_k]
119
+ return None
.venv/lib/python3.12/site-packages/torch/_inductor/autoheuristic/learnedheuristic_interface.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import operator
2
+ from typing import Optional
3
+
4
+ from torch._inductor.autoheuristic.autoheuristic_utils import (
5
+ AHContext,
6
+ AHMetadata,
7
+ Choice,
8
+ )
9
+
10
+
11
+ class LearnedHeuristic:
12
+ """
13
+ LearnedHeuristic is a base class for all learned heuristics.
14
+ """
15
+
16
+ def __init__(self) -> None:
17
+ pass
18
+
19
+ def check_precondition(
20
+ self,
21
+ metadata: AHMetadata,
22
+ context: AHContext,
23
+ ) -> bool:
24
+ return True
25
+
26
+ def get_decision(
27
+ self, context: AHContext, choices: list[Choice]
28
+ ) -> Optional[Choice]:
29
+ return None
30
+
31
+ def get_confidence_threshold(self) -> float:
32
+ return 1.0
33
+
34
+ def get_name(self) -> str:
35
+ return ""
36
+
37
+ def get_decisions_ranked(self, context: AHContext) -> Optional[list[str]]:
38
+ return None
39
+
40
+
41
+ class LearnedHeuristicRegression(LearnedHeuristic):
42
+ def __init__(self) -> None:
43
+ super().__init__()
44
+
45
+ def get_feedback(self, context: AHContext, choice: Choice) -> float:
46
+ return 1.0
47
+
48
+ def get_decision(
49
+ self, context: AHContext, choices: list[Choice]
50
+ ) -> Optional[Choice]:
51
+ choice2feedback = {}
52
+ for choice in choices:
53
+ predicted_feedback = self.get_feedback(context, choice)
54
+ choice2feedback[choice] = predicted_feedback
55
+ sorted_choices_feedback = sorted(
56
+ choice2feedback.items(), key=operator.itemgetter(1)
57
+ )
58
+ highest_feedback = sorted_choices_feedback[-1][1]
59
+ second_highest_feedback = sorted_choices_feedback[-2][1]
60
+ if highest_feedback / second_highest_feedback > self.get_confidence_threshold():
61
+ return sorted_choices_feedback[-1][0]
62
+ # We are not sure which choice is the best one
63
+ return None
64
+
65
+
66
+ class LearnedHeuristicDecision(LearnedHeuristic):
67
+ def __init__(self) -> None:
68
+ super().__init__()
69
+
70
+ def get_choice(self, idx: int) -> Optional[str]:
71
+ return None
72
+
73
+ def get_decision(
74
+ self, context: AHContext, choices: list[Choice]
75
+ ) -> Optional[Choice]:
76
+ best_choices = self.get_best_choices(context)
77
+ if not best_choices:
78
+ return None
79
+ (best_choice_proba, best_choice_idx) = best_choices[0]
80
+ if best_choice_proba <= self.get_confidence_threshold():
81
+ return None
82
+ return self.get_choice(best_choice_idx)
83
+
84
+ def get_decisions_ranked(self, context: AHContext) -> Optional[list[str]]:
85
+ feedback_idx_list = self.get_best_choices(context)
86
+ if feedback_idx_list is None:
87
+ return None
88
+ choices = [
89
+ self.get_choice(feedback_idx[1]) for feedback_idx in feedback_idx_list
90
+ ]
91
+ choices = [choice for choice in choices if choice is not None]
92
+ return choices
93
+
94
+ def get_best_choices(self, context: AHContext) -> Optional[list[tuple[float, int]]]:
95
+ return []
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/__init__.py ADDED
File without changes
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/aoti_hipify_utils.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ import torch
4
+ from torch.utils.hipify.hipify_python import PYTORCH_MAP, PYTORCH_TRIE
5
+
6
+
7
+ # It is not a good idea to directly apply hipify_torch to codegen, which will be vulnerable to cases like:
8
+ # "...
9
+ # from ..codecache import CudaKernelParamCache
10
+ # ..."
11
+ # In such cases, we do not need to hipify_torch the original class/file name in codegen/codecache
12
+
13
+
14
def maybe_hipify_code_wrapper(source_codes: str, force_hipify: bool = False) -> str:
    """Rewrite CUDA identifiers in generated code to their HIP equivalents.

    Returns ``source_codes`` unchanged unless torch is a ROCm build
    (``torch.version.hip`` is set) or ``force_hipify`` is True.
    """
    if torch.version.hip is None and not force_hipify:
        return source_codes

    def _repl(match: re.Match[str]) -> object:
        return PYTORCH_MAP[match.group(0)]

    # We rebuild the pattern instead of reusing hipify_torch's, because that
    # one adds a positive lookbehind (?<=\W) to avoid matching a keyword at
    # the beginning of a code line — but generated code can legitimately start
    # a line with such a keyword, and the lookbehind would miss it.
    #
    # The lookahead (?=\W) is still needed to keep hipification idempotent:
    # e.g. "getStreamFromExternal" must not be replaced inside
    # "getStreamFromExternalMasqueradingAsCUDA".
    pattern = re.compile(rf"({PYTORCH_TRIE.export_to_regex()})(?=\W)")
    return pattern.sub(_repl, source_codes)  # type: ignore[arg-type]
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/aoti_runtime/interface.cpp ADDED
@@ -0,0 +1,443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Definition of AOTI runtime interface functions
2
+
3
+ #include <torch/csrc/inductor/aoti_runtime/interface.h>
4
+ #include <torch/csrc/inductor/aoti_runtime/model_container.h>
5
+
6
+ #include <iostream>
7
+ #include <vector>
8
+
9
// Wraps a statement block in try/catch: on success returns
// AOTI_RUNTIME_SUCCESS; any thrown exception is printed to stderr and
// converted into AOTI_RUNTIME_FAILURE, so no exception escapes the C ABI.
#define CONVERT_EXCEPTION_TO_ERROR_CODE(...)                 \
  try {                                                      \
    __VA_ARGS__                                              \
  } catch (const std::exception& e) {                        \
    std::cerr << "Error: " << e.what() << '\n';              \
    return AOTI_RUNTIME_FAILURE;                             \
  } catch (...) {                                            \
    std::cerr << "Unknown exception occurred.\n";            \
    return AOTI_RUNTIME_FAILURE;                             \
  }                                                          \
  return AOTI_RUNTIME_SUCCESS;

// Asserts (via AOTI_RUNTIME_CHECK) that a caller-supplied array length
// matches what the model expects, with a descriptive error message.
#define AOTI_VECTOR_SIZE_CHECK(actual_size, expected_size, name)  \
  do {                                                            \
    AOTI_RUNTIME_CHECK(                                           \
        actual_size == expected_size,                             \
        "expected " + std::string(name) + " vector size to be " + \
            std::to_string(expected_size) + ", but got " +        \
            std::to_string(actual_size));                         \
  } while (0)
29
+
30
// AOTInductor uses at::addmm_out, which doesn't support
// arguments that require gradient. For this reason, we
// enforce a no_grad context for run APIs.
//
// A RAII, thread local (!) guard that enables or disables grad mode upon
// construction, and sets it back to the original value upon destruction.
struct AOTINoGradGuard {
  AOTINoGradGuard() {
    aoti_torch_grad_mode_set_enabled(false);
  }
  // Non-copyable and non-movable: the guard is tied to the scope it was
  // constructed in.
  AOTINoGradGuard(const AOTINoGradGuard&) = delete;
  AOTINoGradGuard(AOTINoGradGuard&&) noexcept = delete;
  ~AOTINoGradGuard() {
    // Restore whatever grad mode was active before construction.
    aoti_torch_grad_mode_set_enabled(prev_mode);
  }
  AOTINoGradGuard& operator=(const AOTINoGradGuard&) = delete;
  AOTINoGradGuard& operator=(AOTINoGradGuard&&) noexcept = delete;
  // Captured at construction time, before grad mode is disabled.
  bool prev_mode{aoti_torch_grad_mode_is_enabled()};
};
49
+
50
+ extern "C" {
51
+
52
// Legacy entry point: maps the boolean is_cpu flag onto a device string and
// delegates to AOTInductorModelContainerCreateWithDevice.
AOTIRuntimeError AOTInductorModelContainerCreate(
    AOTInductorModelContainerHandle* container_handle,
    size_t num_models,
    bool is_cpu,
    const char* cubin_dir) {
  return AOTInductorModelContainerCreateWithDevice(
      container_handle,
      num_models,
      is_cpu ? "cpu" : "cuda",
      cubin_dir);
}

// Creates a container holding num_models model instances on the given device.
// cubin_dir, when non-null, is where precompiled kernel binaries live.
// The caller owns the returned handle and must free it with
// AOTInductorModelContainerDelete.
AOTIRuntimeError AOTInductorModelContainerCreateWithDevice(
    AOTInductorModelContainerHandle* container_handle,
    size_t num_models,
    const char* device_str,
    const char* cubin_dir) {
  if (num_models == 0) {
    std::cerr << "Error: num_models must be positive, but got 0\n";
    return AOTI_RUNTIME_FAILURE;
  }
  CONVERT_EXCEPTION_TO_ERROR_CODE({
    std::optional<std::string> cubin_dir_opt;
    if (cubin_dir != nullptr) {
      cubin_dir_opt.emplace(cubin_dir);
    }
    auto* container = new torch::aot_inductor::AOTInductorModelContainer(
        num_models, std::string(device_str), cubin_dir_opt);
    *container_handle =
        reinterpret_cast<AOTInductorModelContainerHandle>(container);
  })
}

// Destroys a container created by one of the Create functions above.
AOTIRuntimeError AOTInductorModelContainerDelete(
    AOTInductorModelContainerHandle container_handle) {
  CONVERT_EXCEPTION_TO_ERROR_CODE({
    auto* container =
        reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
            container_handle);
    delete container;
  });
}
94
+
95
// Runs inference on one of the container's models. Input/output handle
// ownership is described inline; sizes are validated before dispatch, and
// grad mode is disabled for the duration of the call.
AOTIRuntimeError AOTInductorModelContainerRun(
    AOTInductorModelContainerHandle container_handle,
    AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles
                                     // are stolen; the array itself is borrowed
    size_t num_inputs,
    AtenTensorHandle*
        output_handles, // array for writing output AtenTensorHandle; handles
                        // will be stolen by the caller; the array itself is
                        // borrowed
    size_t num_outputs,
    AOTInductorStreamHandle stream_handle,
    AOTIProxyExecutorHandle proxy_executor_handle) {
  auto* container =
      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
          container_handle);
  AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs");
  AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs");

  auto stream =
      reinterpret_cast<torch::aot_inductor::DeviceStreamType>(stream_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE({
    AOTINoGradGuard guard;
    container->run(
        input_handles, output_handles, stream, proxy_executor_handle);
  })
}

// Same contract as AOTInductorModelContainerRun, but dispatches to the
// container's single-threaded run path.
AOTIRuntimeError AOTInductorModelContainerRunSingleThreaded(
    AOTInductorModelContainerHandle container_handle,
    AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles
                                     // are stolen; the array itself is borrowed
    size_t num_inputs,
    AtenTensorHandle*
        output_handles, // array for writing output AtenTensorHandle; handles
                        // will be stolen by the caller; the array itself is
                        // borrowed
    size_t num_outputs,
    AOTInductorStreamHandle stream_handle,
    AOTIProxyExecutorHandle proxy_executor_handle) {
  auto* container =
      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
          container_handle);
  AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs");
  AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs");

  auto stream =
      reinterpret_cast<torch::aot_inductor::DeviceStreamType>(stream_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE({
    AOTINoGradGuard guard;
    container->run_single_threaded(
        input_handles, output_handles, stream, proxy_executor_handle);
  })
}
148
+
149
// Writes the number of constants (weights/buffers) held by the container.
AOTIRuntimeError AOTInductorModelContainerGetNumConstants(
    AOTInductorModelContainerHandle container_handle,
    size_t* num_constants) {
  auto* container =
      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
          container_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE(
      { *num_constants = container->num_constants(); })
}

// Writes the name of the idx-th constant. The string is owned by the
// container — the caller must not free it.
AOTIRuntimeError AOTInductorModelContainerGetConstantName(
    AOTInductorModelContainerHandle container_handle,
    size_t idx,
    const char** name) {
  auto* container =
      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
          container_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE(
      { *name = container->constant_name(idx); })
}

// Writes the original fully-qualified name of the idx-th constant as it
// appeared in the source module.
AOTIRuntimeError AOTInductorModelContainerGetConstantOriginalFQN(
    AOTInductorModelContainerHandle container_handle,
    size_t idx,
    const char** original_fqn) {
  auto* container =
      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
          container_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE(
      { *original_fqn = container->constant_original_fqn(idx); })
}

// Writes whether the idx-th constant was produced by constant folding.
AOTIRuntimeError AOTInductorModelContainerGetConstantFromFolded(
    AOTInductorModelContainerHandle container_handle,
    size_t idx,
    bool* from_folded) {
  auto* container =
      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(container_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE({ *from_folded = container->constant_from_folded(idx); })
}

// Writes the container-defined type code of the idx-th constant.
AOTIRuntimeError AOTInductorModelContainerGetConstantType(
    AOTInductorModelContainerHandle container_handle,
    size_t idx,
    int32_t* type) {
  auto* container =
      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(container_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE({ *type = container->constant_type(idx); })
}

// Writes the dtype code of the idx-th constant.
AOTIRuntimeError AOTInductorModelContainerGetConstantDtype(
    AOTInductorModelContainerHandle container_handle,
    size_t idx,
    int32_t* dtype) {
  auto* container =
      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
          container_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE(
      { *dtype = container->constant_dtype(idx); })
}

// Writes the size in bytes of the idx-th constant's data.
AOTIRuntimeError AOTInductorModelContainerGetConstantDataSize(
    AOTInductorModelContainerHandle container_handle,
    size_t idx,
    size_t* data_size) {
  auto* container =
      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
          container_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE(
      { *data_size = container->constant_data_size(idx); })
}
220
+
221
// Copies the container's (active or inactive) name->tensor constant map into
// the caller-provided unordered_map behind constant_map_handle.
AOTIRuntimeError AOTInductorModelContainerExtractConstantsMap(
    AOTInductorModelContainerHandle container_handle,
    AOTInductorConstantMapHandle constant_map_handle,
    bool use_inactive) {
  auto* container =
      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
          container_handle);
  auto constants_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE(
      { const auto ret = container->extract_constants_map(use_inactive);
        for (const auto& pair: ret) {
          constants_map->emplace(pair.first, pair.second);
        }
      })
}

// Updates the constant buffer from a caller-owned map; user_managed = true
// means the container does not take ownership of the tensor storage.
AOTIRuntimeError AOTInductorModelContainerUpdateUserManagedConstantBuffer(
    AOTInductorModelContainerHandle container_handle,
    AOTInductorConstantMapHandle constant_map_handle,
    bool use_inactive,
    bool validate_full_update) {
  auto* container =
      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
          container_handle);
  auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE({
    container->update_constant_buffer(
        *input_map, use_inactive, validate_full_update, /* user_managed = */ true);
  })
}

// Updates the (active or inactive) constant buffer from the supplied map.
// validate_full_update requires every constant to be present in the map.
AOTIRuntimeError AOTInductorModelContainerUpdateConstantBuffer(
    AOTInductorModelContainerHandle container_handle,
    AOTInductorConstantMapHandle constant_map_handle,
    bool use_inactive,
    bool validate_full_update) {
  auto* container =
      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
          container_handle);
  auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE({
    container->update_constant_buffer(
        *input_map, use_inactive, validate_full_update);
  })
}

// Convenience wrapper: full, validated update of the inactive buffer
// (used for double-buffered weight swaps).
AOTIRuntimeError AOTInductorModelContainerUpdateInactiveConstantBuffer(
    AOTInductorModelContainerHandle container_handle,
    AOTInductorConstantMapHandle constant_map_handle) {
  return AOTInductorModelContainerUpdateConstantBuffer(container_handle,
          constant_map_handle,
          /*use_inactive*/ true,
          /*validate_full_update*/ true);
}

// Releases the storage held by the inactive constant buffer.
AOTIRuntimeError AOTInductorModelContainerFreeInactiveConstantBuffer(
    AOTInductorModelContainerHandle container_handle) {
  auto* container =
      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
          container_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE({
    container->free_inactive_constant_buffer();
  })
}

// Re-runs constant folding over the selected buffer, with grad disabled.
AOTIRuntimeError AOTInductorModelContainerRunConstantFolding(
    AOTInductorModelContainerHandle container_handle,
    bool use_inactive,
    AOTInductorStreamHandle stream_handle,
    AOTIProxyExecutorHandle proxy_executor_handle) {
  auto* container =
      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
          container_handle);
  auto stream =
      reinterpret_cast<torch::aot_inductor::DeviceStreamType>(stream_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE({
    AOTINoGradGuard guard;
    container->run_const_fold(use_inactive, stream, proxy_executor_handle);
  })
}

// Atomically swaps the active and inactive constant buffers.
AOTIRuntimeError AOTInductorModelContainerSwapConstantBuffer(
    AOTInductorModelContainerHandle container_handle) {
  auto* container =
      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
          container_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE({
    container->swap_constant_buffer();
  })
}
311
+
312
// Writes the number of model inputs.
AOTIRuntimeError AOTInductorModelContainerGetNumInputs(
    AOTInductorModelContainerHandle container_handle,
    size_t* ret_num_inputs) {
  auto* container =
      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
          container_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE(
      { *ret_num_inputs = container->num_inputs(); })
}

// Writes the name of the input at input_idx (string owned by the container).
AOTIRuntimeError AOTInductorModelContainerGetInputName(
    AOTInductorModelContainerHandle container_handle,
    size_t input_idx,
    const char** ret_input_names) {
  auto* container =
      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
          container_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE(
      { *ret_input_names = container->input_name(input_idx); })
}

// Writes the number of model outputs.
AOTIRuntimeError AOTInductorModelContainerGetNumOutputs(
    AOTInductorModelContainerHandle container_handle,
    size_t* ret_num_outputs) {
  auto* container =
      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
          container_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE(
      { *ret_num_outputs = container->num_outputs(); })
}

// Writes the name of the output at output_idx (string owned by the container).
AOTIRuntimeError AOTInductorModelContainerGetOutputName(
    AOTInductorModelContainerHandle container_handle,
    size_t output_idx,
    const char** ret_output_names) {
  auto* container =
      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
          container_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE(
      { *ret_output_names = container->output_name(output_idx); })
}

// Writes the serialized pytree in/out specs describing the model's
// call signature (strings owned by the container).
AOTIRuntimeError AOTInductorModelContainerGetCallSpec(
    AOTInductorModelContainerHandle container_handle,
    const char** in_spec,
    const char** out_spec) {
  auto* container =
      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
          container_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE({
    *in_spec = container->get_in_spec();
    *out_spec = container->get_out_spec();
  })
}
366
+
367
// Creates a single (container-less) model instance. When a constant map is
// supplied its entries are used as the model's constants; otherwise the
// constants baked into the compiled artifact are loaded.
AOTIRuntimeError AOTInductorModelCreate(
    AOTInductorModelHandle* model_handle,
    AOTInductorConstantMapHandle constant_map_handle){
  CONVERT_EXCEPTION_TO_ERROR_CODE({
    auto constant_map = std::make_shared<torch::aot_inductor::ConstantMap>();
    auto constant_array = std::make_shared<std::vector<torch::aot_inductor::ConstantHandle>>();
    auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);

    auto model = new torch::aot_inductor::AOTInductorModel(
        constant_map,
        constant_array,
        "cpu", // device_str is hardcoded, as AOTInductorModelCreate is only use for CPU models
        ""
    );

    if (input_map) {
      for (auto const& kv : *input_map) {
        constant_map->emplace(kv.first, kv.second);
      }
    } else {
      model->load_constants();
    }

    *model_handle = reinterpret_cast<AOTInductorModelHandle>(model);
  })}

// Runs the model on a null stream with no proxy executor (CPU path),
// with grad mode disabled for the duration of the call.
AOTIRuntimeError AOTInductorModelRun(
    AOTInductorModelHandle model_handle,
    AtenTensorHandle* input_handles,
    AtenTensorHandle* output_handles) {
  auto model =
      reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE({
    AOTINoGradGuard guard;
    model->run_impl(
        input_handles,
        output_handles,
        (torch::aot_inductor::DeviceStreamType) nullptr,
        nullptr);
  })
}

// Destroys a model created by AOTInductorModelCreate.
AOTIRuntimeError AOTInductorModelDelete(AOTInductorModelHandle model_handle){
  CONVERT_EXCEPTION_TO_ERROR_CODE({
    auto model = reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(
        model_handle);
    delete model;
  })}

// Writes the number of outputs produced by the model.
AOTIRuntimeError AOTInductorModelGetNumOutputs(
    AOTInductorModelHandle model_handle,
    size_t* ret_num_outputs) {
  CONVERT_EXCEPTION_TO_ERROR_CODE({
      auto model = reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
      *ret_num_outputs = model->num_outputs();
  })
}

// Replaces the model's constant map with a fresh one built from the
// caller-supplied name->tensor map.
AOTIRuntimeError AOTInductorModelUpdateConstantsMap(
    AOTInductorModelHandle model_handle,
    AOTInductorConstantMapHandle constant_map_handle) {
  auto model =
      reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
  CONVERT_EXCEPTION_TO_ERROR_CODE({
    auto constant_map = std::make_shared<torch::aot_inductor::ConstantMap>();
    auto input_map =
        reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(
            constant_map_handle);

    for (auto const& kv : *input_map) {
      constant_map->emplace(kv.first, kv.second);
    }
    model->update_constants_map(std::move(constant_map));
  })
}
442
+
443
+ } // extern "C"
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/block_analysis.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ import functools
3
+ import textwrap
4
+ from typing import Optional
5
+
6
+ import sympy
7
+ from sympy import Expr, Symbol
8
+
9
+ from torch.utils._sympy.functions import FloorDiv, ModularIndexing
10
+
11
+ from ..utils import sympy_dot, sympy_subs
12
+ from ..virtualized import V
13
+
14
+
15
class BlockPatternMatcher:
    """
    Matches block indexing expressions.

    Used by the Triton codegen to recognize linear indices that decompose
    into multi-dimensional block indices (via FloorDiv/ModularIndexing)
    or into a simple affine stride * index form.
    """

    @classmethod
    def get_subexpr_involving_symbol(cls, expr: Expr, symbol: Symbol) -> Expr:
        """
        Given a sympy expression, return the subexpression comprised only of terms
        involving the specified symbol.

        For example, if `expr` is `x * 5 + x ** 2 + y * 2 + 5`, and `symbol` is `x`,
        this returns `x * 5 + x ** 2`.
        """
        expr = cls._preprocess(expr)
        # Adding S.Zero keeps the result a sympy Expr even when no term matches
        # (sum() of an empty generator would return the int 0).
        return sympy.S.Zero + sum(
            term for term in sympy.Add.make_args(expr) if symbol in term.free_symbols
        )

    @staticmethod
    def get_slice_numels(dims: list[Expr]) -> list[Expr]:
        """
        Compute the cumulative size of each dimension's slice.
        This proceeds from the last dim up to the second.
        """
        # numels[i] is the product of dims[i+1:]; the trailing entry is 1.
        numels = collections.deque([sympy.S.One])
        for dim in dims[:0:-1]:
            numel = dim * numels[0]
            numels.appendleft(numel)
        return [*numels]

    @staticmethod
    def _preprocess(expr: Expr) -> Expr:
        # Remove any Identity nodes, e.g. expand x + (5 * y) to x + 5 * y.
        return expr.expand(identity=True)

    @classmethod
    def match_mod_div_block_expr(
        cls,
        index: Expr,
        index_var: Symbol,
        numel: Expr,
        num_dims: int,
    ) -> Optional[tuple[list[Expr], list[Expr], list[Expr]]]:
        """
        Matches modular indexing expressions, converting them to implied block dimensions and strides.
        See triton.py for more information.

        Returns (dims, strides, block_index_exprs) on a successful match,
        or None when `index` does not fit the mod/div block pattern.
        """
        index = cls._preprocess(index)

        # Pattern match to find the strides and offset.
        # Wildcards must not capture the index variable itself.
        wild = functools.partial(sympy.Wild, exclude=[index_var])
        dims: list[Expr] = [wild(f"dim_mod{idx}") for idx in range(num_dims)]
        strides: list[Expr] = [wild(f"stride_mod{idx}") for idx in range(num_dims)]

        # The first dimension's index is computed by division.
        # The remaining are computed by modulo.
        slice_numels = cls.get_slice_numels(dims[:num_dims])
        block_index_exprs = [FloorDiv(index_var, slice_numels[0])] + [
            ModularIndexing(index_var, numel, dim)
            for dim, numel in zip(dims[1:], slice_numels[1:])
        ]

        # Calculate a linear index from block indices.
        match_expr = sympy_dot(strides, block_index_exprs)

        # Heuristic: if the number of dimensions is high, check that the minimum requirements
        # are met before attempting an expensive full match. see triton.py:match_mod_div_block
        # for more details. In short, here we check that each subexpression in sympy.Add contains
        # only FloorDiv or ModularIndexing expressions.
        if num_dims >= 5:
            stride, denom, other = sympy.symbols("stride denominator other", cls=wild)
            mod_div_pattern = stride * ModularIndexing(index_var, denom, other)
            floor_div_pattern = stride * FloorDiv(index_var, denom)
            first_dim_floor_div_matched = False
            match_failed = False
            for arg in sympy.Add.make_args(index):
                if arg.match(floor_div_pattern):
                    # There should only be a single FloorDiv(index, denom) expression
                    # corresponding to the first dimension
                    if first_dim_floor_div_matched:
                        match_failed = True
                        break
                    first_dim_floor_div_matched = True
                elif arg.match(mod_div_pattern):
                    continue
                else:
                    match_failed = True
                    break

            if match_failed:
                return None

        # Pattern match.
        match = index.match(match_expr)
        if match is None:
            return None

        # Provide default values for unmatched dims and strides.
        # A dim absent from the match degenerates to size 1; an absent
        # stride contributes nothing (0).
        for dim in dims[1:]:
            if dim not in match:
                match[dim] = sympy.S.One
        for stride in strides[1:]:
            if stride not in match:
                match[stride] = sympy.S.Zero

        sizevars = V.graph.sizevars

        def get_match(expr: Expr) -> Expr:
            return sizevars.lookup_precomputed_size(match[expr])

        # Replace wildcards with matched expressions.
        dims = [dims[0]] + [get_match(dim) for dim in dims[1:]]
        strides = [get_match(stride) for stride in strides]
        slice_numels = cls.get_slice_numels(dims)
        block_index_exprs = [sympy_subs(expr, match) for expr in block_index_exprs]

        # The leading dimension is not directly matched in our expression.
        # We solve for it by dividing the range tree numel by the product of
        # all other dimensions. We quit if they are not known to be divisible.
        assert dims[0] not in match, "Expected not to match the leading dimension!"
        if not sizevars.statically_known_multiple_of(numel, slice_numels[0]):
            return None
        dims[0] = numel / slice_numels[0]

        # Sanity check that we can recover the index from the matched subexpressions.
        matched_index = sympy_dot(strides, block_index_exprs)
        assert sizevars.statically_known_equals(
            # New precomputed replacements may be generated when the `get_match` function
            # above is called, but the `index` that is being matched has not been updated.
            # So remove them when checking for equivalence e.g. if ps0=3*s0 and
            # index=3*s0*expr, matched_index=ps0*expr, then index == matched_index
            sizevars.remove_precomputed_replacements(matched_index),
            sizevars.remove_precomputed_replacements(index),
        ), textwrap.dedent(
            f"""
            Invalid match!
            Index: {index}
            Matched expression: {matched_index}
            """
        )

        return dims, strides, block_index_exprs

    @classmethod
    def match_affine_block_expr(
        cls,
        index: Expr,
        index_var: Symbol,
    ) -> Optional[Expr]:
        """
        Matches simple expressions of the form stride * index, returning the
        stride.

        Returns None when `index` is not a pure multiple of `index_var`.
        """
        index = cls._preprocess(index)
        stride = sympy.Wild("stride", exclude=[index_var])
        m = index.match(index_var * stride)
        if m is None:
            return None

        return m[stride]
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/common.py ADDED
@@ -0,0 +1,2691 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import atexit
4
+ import contextlib
5
+ import dataclasses
6
+ import enum
7
+ import functools
8
+ import itertools
9
+ import logging
10
+ import math
11
+ import operator
12
+ import os
13
+ import re
14
+ import tempfile
15
+ from abc import ABC, abstractmethod
16
+ from enum import auto, Enum
17
+ from itertools import chain
18
+ from typing import (
19
+ Any,
20
+ Callable,
21
+ cast,
22
+ ClassVar,
23
+ Generic,
24
+ NamedTuple,
25
+ Optional,
26
+ TYPE_CHECKING,
27
+ Union,
28
+ )
29
+ from typing_extensions import Self, TypeVar
30
+
31
+ import sympy
32
+
33
+ import torch
34
+ import torch.fx
35
+ from torch._prims_common import ELEMENTWISE_TYPE_PROMOTION_KIND
36
+ from torch.utils import _pytree as pytree
37
+ from torch.utils._ordered_set import OrderedSet
38
+ from torch.utils._sympy.numbers import int_oo
39
+ from torch.utils._sympy.printers import PythonPrinter as _PythonPrinter
40
+ from torch.utils._sympy.symbol import free_symbol_is_type, symbol_is_type, SymT
41
+ from torch.utils._sympy.value_ranges import bound_sympy, ValueRanges
42
+
43
+ from .. import config, metrics
44
+ from ..dtype_propagation import DtypePropagationOpsHandler
45
+ from ..ops_handler import BasicMathOpsMixin, DefaultHandler
46
+ from ..utils import (
47
+ boolean_ops,
48
+ DeferredLineBase,
49
+ generate_assert,
50
+ get_current_backend,
51
+ IndentedBuffer,
52
+ ir_dataclass,
53
+ ScopedDict,
54
+ sympy_dot,
55
+ sympy_index_symbol,
56
+ sympy_subs,
57
+ triton_type,
58
+ unique,
59
+ )
60
+ from ..virtualized import ops, OpsHandler, OpsValue, ReductionType, StoreMode, V
61
+
62
+
63
+ if TYPE_CHECKING:
64
+ from collections.abc import Iterator, MutableMapping, Sequence
65
+
66
+ from torch.fx import GraphModule
67
+
68
+ from ..custom_graph_pass import CustomGraphModulePass
69
+ from ..ir import Buffer, ChoiceCaller, FixedLayout, IRNode
70
+ from ..loop_body import LoopBody
71
+ from ..scheduler import BaseScheduling, Scheduler, SchedulerNode
72
+ from .wrapper import PythonWrapperCodegen
73
+
74
+ _T = TypeVar("_T")
75
+ SchedulingConstructor = Callable[[Optional[Scheduler]], BaseScheduling]
76
+ WrapperConstructor = type[PythonWrapperCodegen]
77
+ SymbolLike = Union[str, sympy.Symbol]
78
+
79
+ # OpVarT should really be Union[CSEVariable, str], however this
80
+ # causes typing errors in subclasses (defined in other files).
81
+ OpVarT = str
82
+
83
+ schedule_log = torch._logging.getArtifactLogger(__name__, "schedule")
84
+ log = logging.getLogger(__name__)
85
+
86
+
87
def data_type_logger(msg: str) -> None:
    """Emit a dtype-propagation message to the schedule artifact logger.

    Guarded on the DEBUG level so callers can log unconditionally without
    paying for message formatting when the artifact log is disabled.
    """
    if not schedule_log.isEnabledFor(logging.DEBUG):
        return
    schedule_log.debug("Data type propagation: %s", msg)
90
+
91
+
92
@dataclasses.dataclass
class FileBackedGraphModule:
    """
    Output of FX wrapper codegen. Exposes the same methods as ModuleType, but these
    map back to a GraphModule instead of Python source.
    """

    # The FX module whose generated code this object exposes via `value`.
    gm: GraphModule
    # Callable invoked by `call`; presumably the compiled form of `gm` — produced elsewhere.
    compiled_fn: Callable[..., Any]

    def __post_init__(self) -> None:
        # Write the code to a file for compatibility with debugging utilities.
        # The file is deleted upon program termination.
        # delete=False + atexit: the file must outlive the `with` block below so
        # tooling can open it by path; removal is deferred to interpreter exit.
        self.tempfile = tempfile.NamedTemporaryFile(
            mode="w+", suffix=".py", delete=False
        )
        atexit.register(os.remove, self.tempfile.name)
        with self.tempfile as f:
            f.write(self.value)

    @property
    def __file__(self) -> str:
        # Mimic a real module's __file__ so debuggers can locate the source.
        return self.tempfile.name

    def call(self, args: list[Any]) -> Any:
        """Invoke the compiled function, unpacking `args` positionally."""
        return self.compiled_fn(*args)

    @property
    def value(self) -> str:
        """The Python source generated for the wrapped GraphModule."""
        return self.gm.code
122
+
123
+
124
class WorkspaceZeroMode(enum.Enum):
    """Zero-initialization requirement for a kernel workspace buffer."""

    UNINITIALIZED = 0
    ZERO_ON_CALL = 1  # kernel may leave workspace dirty
    ZERO_PER_GRAPH = 2  # must be re-zeroed by kernel

    @staticmethod
    def combine(a: WorkspaceZeroMode, b: WorkspaceZeroMode) -> WorkspaceZeroMode:
        """Merge two requirements; UNINITIALIZED defers to the other side."""
        for kept, other in ((a, b), (b, a)):
            if kept == other or other == WorkspaceZeroMode.UNINITIALIZED:
                return kept
        raise NotImplementedError(f"WorkspaceZeroMode.combine({a!r}, {b!r})")

    @staticmethod
    def from_bool(zero_fill: bool) -> WorkspaceZeroMode:
        """Map a boolean zero-fill flag onto the corresponding mode."""
        return (
            WorkspaceZeroMode.ZERO_ON_CALL
            if zero_fill
            else WorkspaceZeroMode.UNINITIALIZED
        )
142
+
143
+
144
class CodegenSymbol(ABC):
    """
    An IR object possibly corresponding to a variable in the wrapper code.
    """

    @abstractmethod
    def get_name(self) -> str:
        """Return the name this symbol is referred to by in generated code."""
        pass

    @abstractmethod
    def get_example(self) -> Union[torch.Tensor, sympy.Symbol]:
        """Return an example value (tensor or sympy symbol) for this symbol."""
        pass
156
+
157
+
158
@ir_dataclass(frozen=True)
class WorkspaceArg(CodegenSymbol):
    """A temporary buffer used for a single kernel, then discarded.

    Not registered as a traditional buffer since there are no users,
    so it would be dead code eliminated.

    Args:
        nbytes: The size of the buffer in bytes.
        zero_fill: Whether the buffer should be initialized to zero.

    """

    # Number of elements (of `dtype`) in the workspace.
    count: sympy.Expr
    # Zero-initialization requirement; see WorkspaceZeroMode.
    zero_mode: WorkspaceZeroMode
    device: torch.device
    # Name of the buffer in the wrapper code.
    outer_name: str
    # Name of the pointer argument inside the kernel.
    inner_name: str = "ws_ptr"
    dtype: torch.dtype = torch.uint8

    @staticmethod
    def unique_name(prefix: str = "workspace_") -> str:
        """Generate a fresh outer name using the graph-wide workspace counter."""
        return f"{prefix}{next(V.graph.workspace_id)}"

    @staticmethod
    def can_join(a: WorkspaceArg, b: WorkspaceArg) -> bool:
        """Two workspaces can be merged when kernel-side name/dtype/device agree."""
        return (
            a.inner_name == b.inner_name and a.dtype == b.dtype and a.device == b.device
        )

    @staticmethod
    def join(a: WorkspaceArg, b: WorkspaceArg) -> WorkspaceArg:
        """Concatenate two compatible workspaces (counts add up)."""
        return WorkspaceArg(
            count=a.count + b.count,
            zero_mode=WorkspaceZeroMode.combine(a.zero_mode, b.zero_mode),
            dtype=a.dtype,
            device=a.device,
            inner_name=a.inner_name,
            outer_name=a.outer_name,
        )

    @staticmethod
    def maximum(a: WorkspaceArg, b: WorkspaceArg) -> WorkspaceArg:
        """Overlay two compatible workspaces (count is the symbolic max)."""
        assert (
            a.dtype == b.dtype and a.device == b.device and a.inner_name == b.inner_name
        )
        return WorkspaceArg(
            count=sympy.Max(a.count, b.count),
            zero_mode=WorkspaceZeroMode.combine(a.zero_mode, b.zero_mode),
            dtype=a.dtype,
            device=a.device,
            inner_name=a.inner_name,
            outer_name=a.outer_name,
        )

    # These methods let WorkspaceArg pretend it is a buffer to reuse allocation code
    def get_device(self) -> torch.device:
        return self.device

    get_device_or_error = get_device

    def get_dtype(self) -> torch.dtype:
        return self.dtype

    def get_example(self) -> Union[torch.Tensor, sympy.Symbol]:
        return self.get_layout().get_example()

    def get_layout(self) -> FixedLayout:
        # Local import to avoid a circular dependency with ..ir.
        from ..ir import FixedLayout

        # A workspace is always a contiguous 1-D buffer of `count` elements.
        return FixedLayout(
            device=self.device,
            dtype=self.dtype,
            size=[self.count],
            stride=[1],
        )

    @property
    def layout(self) -> FixedLayout:
        return self.get_layout()

    # Buffer-protocol aliases so allocation code can treat this like an IR buffer.
    get_output_spec = get_layout
    maybe_get_output_spec = get_layout
    maybe_get_layout = get_layout

    def get_offset(self) -> sympy.Expr:
        return sympy.S.Zero

    def get_size(self) -> list[sympy.Expr]:
        return [self.count]

    def get_stride(self) -> list[sympy.Expr]:
        return [sympy.S.One]

    def get_name(self) -> str:
        return self.outer_name

    def get_inputs_that_alias_output(self) -> list[str]:
        # A workspace never aliases anything.
        return []
257
+
258
+
259
class TritonScratchWorkspace:
    """Size plus a lazily evaluated dtype string for a Triton scratch buffer."""

    def __init__(self, size: int, generate_dtype_str: Callable[..., str]):
        # The dtype string is produced on demand via the stored callback.
        self.size = size
        self._generate_dtype_str = generate_dtype_str

    def generate_dtype_str(self) -> str:
        """Materialize the dtype string by invoking the stored callback."""
        return self._generate_dtype_str()
266
+
267
+
268
@dataclasses.dataclass
class TensorArg:
    """A tensor kernel argument: maps a kernel-local name to a graph buffer."""

    # Name used inside the generated kernel.
    name: str
    # Name of the backing buffer in the graph.
    buffer: str
    dtype: torch.dtype
    offset: sympy.Expr = sympy.S.Zero  # c++ only
    alias_of: Optional[str] = None  # halide only
275
+
276
+
277
@dataclasses.dataclass
class SizeArg:
    """A symbolic size kernel argument."""

    name: str
    # sympy expression giving the runtime value of this size.
    expr: sympy.Expr

    @property
    def alias_of(self) -> Optional[str]:
        # Always None; present so SizeArg can be handled uniformly with
        # TensorArg (which has an alias_of field).
        return None
285
+
286
+
287
@dataclasses.dataclass
class ConstexprArg:
    """A kernel parameter passed as a compile-time constant (name only)."""

    name: str
290
+
291
+
292
@dataclasses.dataclass
class TMADescriptorArg:
    """A TMA descriptor kernel argument (Triton host-side TMA APIs)."""

    name: str
    api_type: str  # "experimental" or "stable"
    block_shape: Optional[list[sympy.Expr]]  # only needed for "stable"
    dtype: Optional[torch.dtype]  # only needed for "stable"
298
+
299
+
300
@dataclasses.dataclass
class DeviceCodegen:
    """Bundle of codegen entry points registered for one device type."""

    # Factory producing the kernel-side scheduling for this device.
    scheduling: SchedulingConstructor
    # Python wrapper codegen class.
    wrapper_codegen: WrapperConstructor
    # Optional C++ wrapper codegen class (None when unsupported).
    cpp_wrapper_codegen: Optional[WrapperConstructor] = None
305
+
306
+
307
# Union of every argument kind a generated kernel can accept.
KernelArgType = Union[WorkspaceArg, TensorArg, SizeArg, TMADescriptorArg, ConstexprArg]

# Registry mapping a device type string (e.g. "cuda") to its codegen classes.
# Populated by register_backend_for_device / init_backend_registration.
device_codegens: dict[str, DeviceCodegen] = {}
310
+
311
+
312
class DeviceOpOverrides:
    """
    Per-device hooks that supply device-specific wrapper-code snippets
    (stream access, device/stream guards, kernel driver boilerplate, TMA
    helpers). Concrete backends register a subclass via
    register_device_op_overrides; every base method raises
    NotImplementedError so a missing override fails loudly.
    """

    def import_get_raw_stream_as(self, name: str) -> str:
        raise NotImplementedError

    def set_device(self, device_idx: int) -> str:
        raise NotImplementedError

    def synchronize(self) -> str:
        raise NotImplementedError

    def device_guard(self, device_idx: int) -> str:
        raise NotImplementedError

    def cpp_device_guard(self) -> str:
        raise NotImplementedError

    def cpp_aoti_device_guard(self) -> str:
        raise NotImplementedError

    def cpp_stream_guard(self) -> str:
        raise NotImplementedError

    def cpp_aoti_stream_guard(self) -> str:
        raise NotImplementedError

    def cpp_getStreamFromExternal(self) -> str:
        raise NotImplementedError

    def kernel_header(self) -> str:
        raise NotImplementedError

    def kernel_driver(self) -> str:
        raise NotImplementedError

    def cpp_stream_type(self) -> str:
        raise NotImplementedError

    def aoti_get_stream(self) -> str:
        raise NotImplementedError

    def cpp_kernel_type(self) -> str:
        raise NotImplementedError

    def cpp_device_ptr(self) -> str:
        raise NotImplementedError

    def tma_descriptor_helpers(self) -> str:
        raise NotImplementedError

    def cpp_global_scratch(
        self, idx: int, workspace: TritonScratchWorkspace
    ) -> Optional[tuple[list[str], str]]:
        # optionally return (scratch definition, arg name)
        raise NotImplementedError
366
+
367
+
368
# Registry of per-device runtime-op override helpers (see DeviceOpOverrides);
# populated by register_device_op_overrides.
device_op_overrides_dict: dict[str, DeviceOpOverrides] = {}
# Optional custom FX graph pass per device; filled by register_backend_for_device.
custom_backend_passes: dict[str, Optional[CustomGraphModulePass]] = {}
370
+
371
+
372
+ # The code generated by Inductor consists of two main parts: kernel code and wrapper code.
373
+ # For any new backend looking to integrate with Inductor, customization of these two main
374
+ # parts are necessary to generate its specific code.
375
+ #
376
+ # Kernel code generation is determined by different Scheduling. Consequently, a new
377
+ # backend needs to provide a custom Scheduling for its unique kernel code generation. Currently,
378
+ # CppScheduling and TritonScheduling serve the C++/OpenMP and Triton backends, respectively.
379
+ #
380
+ # For the Wrapper, Inductor provides a PythonWrapperCodegen class to generate the Python wrapper code
381
+ # that bridges kernels. This allows out-of-tree backends to inherit from PythonWrapperCodegen,
382
+ # and override specific member functions to create backend-specific Python wrapper code.
383
+ #
384
+ # Other classes, such as CppKernel and TritonKernel, used for code generation, typically form part
385
+ # of the logic for either Scheduling or PythonWrapperCodegen. So the Scheduling and PythonWrapperCodegen interfaces
386
+ # provide flexibility to the backend. A backend can choose to implement these classes from scratch,
387
+ # or reuse them by extending and overriding as necessary. And Inductor provides the registration API,
388
+ # register_backend_for_device, to equip a new backend at runtime.
389
+ #
390
+ # Intel has developed a new backend on top of Triton to support Intel GPUs, leveraging these interfaces.
391
+ # This backend can be used as a reference:
392
+ # https://github.com/intel/intel-extension-for-pytorch/blob/5dcc9d57e5422cf295e1a1ee97896d6b6a554a85/intel_extension_for_pytorch/_inductor/__init__.py#L9
393
def register_backend_for_device(
    device: str,
    device_scheduling: SchedulingConstructor,
    device_wrapper_codegen: WrapperConstructor,
    device_cpp_wrapper_codegen: Optional[WrapperConstructor] = None,
    device_custom_pass: Optional[CustomGraphModulePass] = None,
) -> None:
    """
    Register codegen classes for a device type (see module comment above).

    Args:
        device: device type string, e.g. "cuda" or "cpu".
        device_scheduling: factory producing the kernel-side scheduling.
        device_wrapper_codegen: Python wrapper codegen class.
        device_cpp_wrapper_codegen: optional C++ wrapper codegen class.
        device_custom_pass: optional custom FX graph pass for this backend.
    """
    device_codegens[device] = DeviceCodegen(
        device_scheduling, device_wrapper_codegen, device_cpp_wrapper_codegen
    )
    custom_backend_passes[device] = device_custom_pass
404
+
405
+
406
class BackendFeature(Enum):
    """Optional codegen capabilities a backend may advertise; queried via
    get_backend_features / has_backend_feature."""

    FOREACH = auto()
    BUCKETIZE = auto()
    INPLACE_BUFFERS = auto()
    MASKED_SCATTER_WITH_INDEX = auto()
    SCAN = auto()
    SORT = auto()
    TUPLE_REDUCTION = auto()
    PREFER_STORE_LOOP_ORDER = auto()
    TRITON_TEMPLATES = auto()
    REDUCE_TO_SINGLE_ELEMENT = auto()
417
+
418
+
419
def get_backend_features(
    device: Union[torch.device, str, None],
) -> OrderedSet[BackendFeature]:
    """Return the feature set of the backend registered for `device`.

    Accepts a torch.device, a device-type string, or None (empty set).
    Asserts that a scheduling is registered for the device type.
    """
    if device is None:
        return OrderedSet()
    # Make sure the built-in backends are registered before the lookup.
    init_backend_registration()
    if isinstance(device, torch.device):
        device_type = device.type
    else:
        assert isinstance(device, str), type(device)
        device_type = device
        device = torch.device(device_type)
    scheduling_ctor = get_scheduling_for_device(device_type)
    assert scheduling_ctor
    # Instantiate a scheduling without a Scheduler just to query its features.
    scheduling = scheduling_ctor(None)
    return scheduling.get_backend_features(device)
435
+
436
+
437
def has_backend_feature(
    device: Union[torch.device, str, None], feature: BackendFeature
) -> bool:
    """Whether `device`'s registered backend advertises `feature`.

    See also V.graph.has_feature.
    """
    assert isinstance(feature, BackendFeature)
    supported = get_backend_features(device)
    return feature in supported
443
+
444
+
445
def get_scheduling_for_device(device: str) -> Optional[SchedulingConstructor]:
    """Look up the scheduling constructor registered for `device`, if any."""
    entry = device_codegens.get(device)
    return entry.scheduling if entry is not None else None
447
+
448
+
449
def get_wrapper_codegen_for_device(
    device: str, cpp_wrapper: bool = False
) -> Optional[WrapperConstructor]:
    """Return the (C++ or Python) wrapper codegen class registered for
    `device`, or None when the device has no registered backend."""
    entry = device_codegens.get(device)
    if entry is None:
        return None
    return entry.cpp_wrapper_codegen if cpp_wrapper else entry.wrapper_codegen
460
+
461
+
462
def get_custom_backend_pass_for_device(device: str) -> Optional[CustomGraphModulePass]:
    """Return the custom FX pass registered for `device`, or None."""
    # dict.get already yields None for unregistered devices.
    return custom_backend_passes.get(device)
464
+
465
+
466
@functools.cache
def init_backend_registration() -> None:
    """
    Register the built-in Inductor backends (cpu, cuda, xpu, mps) and, when
    available, the privateuse1 out-of-tree backend. Cached so the registration
    side effects run at most once per process.
    """
    from .cpp import CppScheduling
    from .cpp_wrapper_cpu import CppWrapperCpu
    from .cpp_wrapper_cpu_array_ref import CppWrapperCpuArrayRef
    from .cpp_wrapper_gpu import CppWrapperGpu
    from .cpp_wrapper_mps import CppWrapperMps
    from .cuda_combined_scheduling import CUDACombinedScheduling
    from .halide import HalideScheduling
    from .mps import MetalScheduling
    from .triton import TritonScheduling
    from .wrapper import PythonWrapperCodegen

    if get_scheduling_for_device("cpu") is None:
        # CPU scheduling is selected at call time via config.cpu_backend.
        cpu_backends = {
            "cpp": CppScheduling,
            "halide": HalideScheduling,
            "triton": TritonScheduling,
        }
        register_backend_for_device(
            "cpu",
            lambda scheduling: cpu_backends[config.cpu_backend](scheduling),
            PythonWrapperCodegen,
            CppWrapperCpuArrayRef
            if config.aot_inductor.allow_stack_allocation
            else CppWrapperCpu,
        )

    if get_scheduling_for_device("cuda") is None:
        # CUDACombinedScheduling combines Triton and CUDA C++ scheduling for CUDA devices via delegation
        cuda_backends = {
            "triton": CUDACombinedScheduling,
            "halide": HalideScheduling,
        }
        register_backend_for_device(
            "cuda",
            lambda scheduling: cuda_backends[config.cuda_backend](scheduling),
            PythonWrapperCodegen,
            CppWrapperGpu,
        )

    if get_scheduling_for_device("xpu") is None:
        register_backend_for_device(
            "xpu",
            TritonScheduling,
            PythonWrapperCodegen,
            CppWrapperGpu,
        )

    if get_scheduling_for_device("mps") is None:
        register_backend_for_device(
            "mps",
            MetalScheduling,
            PythonWrapperCodegen,
            CppWrapperMps,
        )

    # "privateuseone" is the placeholder name; a renamed backend indicates a
    # real out-of-tree device whose codegen classes we try to fetch.
    private_backend = torch._C._get_privateuse1_backend_name()
    if (
        private_backend != "privateuseone"
        and get_scheduling_for_device(private_backend) is None
    ):
        from torch.utils.backend_registration import _get_custom_mod_func

        try:
            device_scheduling = _get_custom_mod_func("Scheduling")
            wrapper_codegen = _get_custom_mod_func("PythonWrapperCodegen")
            cpp_wrapper_codegen = _get_custom_mod_func("CppWrapperCodegen")
            if device_scheduling and wrapper_codegen and cpp_wrapper_codegen:
                register_backend_for_device(
                    private_backend,
                    device_scheduling,
                    wrapper_codegen,
                    cpp_wrapper_codegen,
                )
        except RuntimeError:
            # Best-effort: the custom module may not expose codegen hooks.
            pass
543
+
544
+
545
def index_prevent_reordering(
    index: Sequence[sympy.Expr],
    index_vars: Sequence[sympy.Expr],
    sizes: Sequence[sympy.Expr],
) -> list[sympy.Expr]:
    """Return `index` extended with a contiguous linear index over
    `index_vars`/`sizes` so that loop-reordering passes keep the original
    iteration order."""
    from ..ir import FlexibleLayout

    # added contiguous index prevents reordering
    return [*index, sympy_dot(index_vars, FlexibleLayout.contiguous_strides(sizes))]
554
+
555
+
556
def register_device_op_overrides(
    device: str, device_op_overrides: DeviceOpOverrides
) -> None:
    """Register the DeviceOpOverrides implementation for a device type."""
    device_op_overrides_dict[device] = device_op_overrides
560
+
561
+
562
def get_device_op_overrides(device: str) -> DeviceOpOverrides:
    """Return the DeviceOpOverrides registered for `device`.

    On first use, imports the built-in override modules (their import
    registers them as a side effect). Raises KeyError if `device` has no
    registered overrides.
    """
    assert isinstance(device, str), type(device)

    if not device_op_overrides_dict:
        # These imports populate device_op_overrides_dict via registration.
        from . import cpu_device_op_overrides, mps_device_op_overrides  # noqa: F401
        from .cuda import device_op_overrides  # noqa: F401
        from .xpu import device_op_overrides as xpu_op_overrides  # noqa: F401

    return device_op_overrides_dict[device]
571
+
572
+
573
# Maps a tensor dtype to the dtype used for intermediate computation:
# reduced-precision floats (bfloat16/float16) are upcast to float32, while
# every other listed dtype computes in itself.
DTYPE_TO_COMPUTATION_DTYPE: dict[torch.dtype, torch.dtype] = {
    torch.bfloat16: torch.float,
    torch.float16: torch.float,
    **{
        dtype: dtype
        for dtype in [
            torch.bool,
            torch.float32,
            torch.float64,
            torch.int8,
            torch.int16,
            torch.int32,
            torch.int64,
            torch.uint8,
            torch.uint16,
            torch.uint32,
            torch.uint64,
        ]
    },
}
593
+
594
+
595
def deduce_output_dtype_by_name(
    op_name: str,
    *args: Any,
    **kwargs: Any,
) -> Optional[torch.dtype]:
    """
    Given op name and a list of input dtypes, deduce the output dtype

    Returns None when the dtype cannot be determined from the op name alone
    (callers then fall back to input-dtype promotion). The positional index
    used for each op mirrors that op's signature, where dtype may arrive
    either by keyword or at a fixed positional slot.
    """
    if op_name in boolean_ops():
        return torch.bool
    elif op_name in (
        "to_dtype",
        "index_expr",
    ):
        # dtype is the last positional argument when not passed by keyword.
        return kwargs["dtype"] if "dtype" in kwargs else args[-1]
    elif op_name in (
        "rand",
        "randn",
    ):
        return torch.float
    elif op_name in (
        "get_index",
        "randint64",
        "load_seed",
    ):
        return torch.int64
    elif op_name == "reduction":
        # reduction(dtype, src_dtype, ...): output dtype is the 2nd positional.
        return kwargs["dtype"] if "dtype" in kwargs else args[1]
    elif op_name == "constant":
        return kwargs["dtype"] if "dtype" in kwargs else args[-1]
    elif op_name in (
        "load",
        "store",
        "store_reduction",
    ):
        # args[0] is the ops handler; args[1] is the buffer name.
        buf_name = args[1]
        return V.graph.get_dtype(buf_name)  # type: ignore[arg-type]
    elif op_name == "to_dtype_bitcast":
        # to_dtype_bitcast(x, dtype, src_dtype): dtype is second-to-last.
        return kwargs["dtype"] if "dtype" in kwargs else args[-2]
    return None
635
+
636
+
637
def check_dtype(
    buffer: IndentedBuffer, var: CSEVariableType, dtype: torch.dtype
) -> None:
    """Emit a backend-specific assertion into `buffer` that `var` has `dtype`.

    Only active under the corresponding test_configs flags: a Triton
    `tl.static_assert` for the triton backend, a C++ `static_assert` for cpp;
    otherwise a no-op.
    """
    backend = get_current_backend()
    if config.test_configs.runtime_triton_dtype_assert and backend == "triton":
        buffer.writeline(f"tl.static_assert({var}.dtype == {triton_type(dtype)})")
    elif config.test_configs.static_cpp_dtype_assert and backend == "cpp":
        from .cpp_utils import CppCSEVariable, DTYPE_TO_CPP

        assert isinstance(var, CppCSEVariable), type(var)
        if dtype == torch.bool:
            if var.is_vec:
                is_same_dt = f"IsVecMaskType<decltype({var})>::value"
            else:
                # operator&(bool, bool) returns int and it can be used as boolean in C++
                is_same_dt = f"std::is_same_v<decltype({var}), bool> || std::is_same_v<decltype({var}), int>"
        else:
            c_var_type = f"decltype({var})"
            if var.is_vec:
                # Vectorized variables assert on their element type.
                c_var_type = f"typename {c_var_type}::value_type"
            is_same_dt = f"std::is_same_v<{c_var_type}, {DTYPE_TO_CPP[dtype]}>"

        buffer.writeline(f"static_assert({is_same_dt});")
660
+
661
+
662
class DataTypePropagation:
    """Propagates dtypes through a LoopBody's FX graphs, storing the result on
    each node's meta under OptimizationContext.key."""

    def __init__(self, body: LoopBody) -> None:
        self.body = body
        # "root" plus one graph per subblock (e.g. masked subblocks).
        self.graphs: dict[Union[Callable[..., Any], str], Any] = {
            "root": body.root_block.graph
        }
        for k, v in body.subblocks.items():
            self.graphs[k] = v.graph

    def deduce_node_dtype_by_inputs(self, node: torch.fx.Node) -> Optional[torch.dtype]:
        """Promote the dtypes of all non-placeholder inputs; None if any
        input has not been propagated yet."""
        inputs = node.all_input_nodes
        input_nodes = [
            n for n in inputs if isinstance(n, torch.fx.Node) and n.op != "placeholder"
        ]
        if len(input_nodes) == 0:
            return None

        all_input_nodes_propagated = all(
            OptimizationContext.key in n.meta
            and n.meta[OptimizationContext.key].dtype is not None
            for n in input_nodes
        )
        if not all_input_nodes_propagated:
            return None

        # Standard torch type promotion across all inputs.
        return functools.reduce(
            torch.promote_types,
            [n.meta[OptimizationContext.key].dtype for n in input_nodes],
        )

    def deduce_node_dtype_by_subgraph(self, node: torch.fx.Node) -> torch.dtype:
        """Use the subgraph's (output) dtype for nodes that call a subblock."""
        sub_graph = self.graphs[node.target]
        dtype = self.propagate_graph(sub_graph)
        assert dtype
        return dtype

    def deduce_node_dtype(self, node: torch.fx.Node) -> Optional[torch.dtype]:
        """Deduce a single node's dtype: by op name first, then by inputs."""
        if node.op == "placeholder":
            return None

        if node.target == "output" and len(node.args) != 1:
            # we can infer output node if it only have 1 arg
            return None

        if node.target == operator.getitem:
            # getitem inherits the dtype of the tuple-producing node.
            node_arg = node.args[0]
            assert isinstance(node_arg, torch.fx.Node), type(node_arg)
            return self.deduce_node_dtype(node_arg)

        assert isinstance(node.target, str), type(node.target)

        if node.target.startswith("masked_subblock"):
            return self.deduce_node_dtype_by_subgraph(node)

        if (
            output_dtype := deduce_output_dtype_by_name(
                node.target,
                *node.args,
                **node.kwargs,
            )
        ) is not None:
            return output_dtype

        return self.deduce_node_dtype_by_inputs(node)

    def propagate_graph(self, graph: torch.fx.Graph) -> Optional[torch.dtype]:
        """Propagate dtypes over `graph` in node order; returns the output
        node's dtype (if any)."""
        assert graph.nodes
        graph_dtype: Optional[torch.dtype] = None
        # For masked_subblock, we use output's dtype to represent
        # the dtype of this subgraph. For other cases, graph_dtype
        # might be None
        for node in graph.nodes:
            if OptimizationContext.key in node.meta:
                opt_ctx = node.meta[OptimizationContext.key]
            else:
                opt_ctx = OptimizationContext()

            opt_ctx.dtype = self.deduce_node_dtype(node)
            node.meta[OptimizationContext.key] = opt_ctx
            if node.target == "output":
                graph_dtype = opt_ctx.dtype
        return graph_dtype

    def propagate(self) -> Optional[torch.dtype]:
        """Propagate starting from the root graph."""
        return self.propagate_graph(self.graphs["root"])

    @classmethod
    def propagate_loopbody(cls, body: LoopBody) -> Optional[torch.dtype]:
        """Convenience: propagate dtypes for a whole LoopBody."""
        return cls(body).propagate()

    @classmethod
    def propagate_scheduler_node(cls, node: SchedulerNode) -> Optional[torch.dtype]:
        """Convenience: propagate dtypes for a SchedulerNode's LoopBody."""
        from ..loop_body import LoopBody
        from ..scheduler import SchedulerNode

        assert isinstance(node, SchedulerNode), type(node)
        assert isinstance(node._body, LoopBody), type(node._body)
        return DataTypePropagation.propagate_loopbody(node._body)
760
+
761
+
762
class PythonPrinter(_PythonPrinter):
    """Sympy-to-Python printer that simplifies via the graph's sizevars."""

    def doprint(
        self, expr: sympy.Expr, *, simplify: bool = True, p: bool = True
    ) -> str:
        """Print `expr`; when possible, simplify it with V.graph.sizevars first."""
        # TODO: why are people passing strings to the printer here :think:
        # Guard with hasattr: V.graph may not have sizevars in all contexts.
        if simplify and isinstance(expr, sympy.Expr) and hasattr(V.graph, "sizevars"):
            expr = V.graph.sizevars.simplify(expr)
        return super().doprint(expr)
770
+
771
+
772
class OpDecompositions:
    """
    Decomposes inductor ops

    Each staticmethod rewrites one op in terms of simpler ops via the
    `ops` handler, for backends that do not implement the op natively.
    """

    @staticmethod
    def identity(value: OpVarT) -> OpVarT:
        # used to trigger cse
        return value

    @staticmethod
    def reciprocal(x: OpVarT) -> OpVarT:
        # 1 / x
        return ops.truediv(ops.constant(1, torch.int32), x)

    @staticmethod
    def square(x: OpVarT) -> OpVarT:
        return ops.mul(x, x)

    @staticmethod
    def erfc(x: OpVarT) -> OpVarT:
        # erfc(x) = 1 - erf(x)
        return ops.sub(ops.constant(1, torch.float32), ops.erf(x))

    @staticmethod
    def erfcx(x: OpVarT) -> OpVarT:
        # erfcx(x) = exp(x^2) * erfc(x)
        return ops.mul(ops.exp(ops.square(x)), ops.erfc(x))

    @staticmethod
    def expm1(x: OpVarT) -> OpVarT:
        # expm1(x) = exp(x) - 1
        return ops.sub(ops.exp(x), ops.constant(1, torch.float32))

    @staticmethod
    def log10(x: OpVarT) -> OpVarT:
        # log10(x) = log(x) / log(10)
        return ops.mul(ops.log(x), ops.constant(1 / math.log(10), torch.float32))

    @staticmethod
    def log2(x: OpVarT) -> OpVarT:
        # log2(x) = log(x) / log(2)
        return ops.mul(ops.log(x), ops.constant(1 / math.log(2), torch.float32))

    @staticmethod
    def exp2(x: OpVarT) -> OpVarT:
        # 2**x = exp(x * log(2))
        return ops.exp(ops.mul(x, ops.constant(math.log(2), torch.float32)))

    @staticmethod
    def log1p(x: OpVarT) -> OpVarT:
        # log1p(x) = log(x + 1)
        return ops.log(ops.add(x, ops.constant(1, torch.int32)))

    @staticmethod
    def sigmoid(x: OpVarT) -> OpVarT:
        # sigmoid(x) = 1 / (1 + exp(-x))
        one = ops.constant(1, torch.int32)
        return ops.truediv(one, ops.add(one, ops.exp(ops.neg(x))))

    @staticmethod
    def relu(x: OpVarT) -> OpVarT:
        return ops.maximum(x, ops.constant(0, torch.int32))

    @staticmethod
    def fma(x: OpVarT, y: OpVarT, z: OpVarT) -> OpVarT:
        # for backends that don't override this (halide)
        return ops.add(ops.mul(x, y), z)

    @staticmethod
    def floor_to_int(a: OpVarT, dtype: torch.dtype) -> OpVarT:
        return ops.to_dtype(ops.floor(a), dtype)

    @staticmethod
    def ceil_to_int(a: OpVarT, dtype: torch.dtype) -> OpVarT:
        return ops.to_dtype(ops.ceil(a), dtype)

    @staticmethod
    def trunc_to_int(a: OpVarT, dtype: torch.dtype) -> OpVarT:
        return ops.to_dtype(ops.trunc(a), dtype)

    @staticmethod
    def remainder(a: OpVarT, b: OpVarT) -> OpVarT:
        # Python-style modulo: adjust mod's result so the sign follows `b`
        # when the remainder is nonzero and its sign differs from b's.
        r = ops.mod(a, b)
        cond = ops.and_(
            ops.ne(r, ops.constant(0, torch.int32)),
            ops.ne(ops.signbit(r), ops.signbit(b)),
        )
        return ops.where(cond, ops.add(r, b), r)

    @staticmethod
    def round_to_int(a: OpVarT, dtype: torch.dtype) -> OpVarT:
        return ops.to_dtype(ops.round(a), dtype)
857
+
858
+ _RE_PAREN_NOT_NEEDED = re.compile(r"[a-z0-9_.]+|\([^)]*\)|", flags=re.IGNORECASE)
859
+
860
+
861
+ def _all_in_parens(string: str) -> bool:
862
+ if string[0] != "(" or len(string) < 2:
863
+ return False
864
+ count = 1
865
+ for i, char in enumerate(string[1:]):
866
+ if char == "(":
867
+ count += 1
868
+ elif char == ")":
869
+ count -= 1
870
+ if count == 0 and i != len(string) - 2:
871
+ return False
872
+ assert count == 0
873
+ return True
874
+
875
+
876
+ class OpOverrides(BasicMathOpsMixin, OpDecompositions, OpsHandler[Any]):
877
+ @staticmethod
878
+ def paren(string: OpVarT) -> OpVarT:
879
+ if (
880
+ isinstance(string, CSEVariable)
881
+ or _RE_PAREN_NOT_NEEDED.fullmatch(string)
882
+ or _all_in_parens(string)
883
+ ):
884
+ # don't put extra parens for strings that are already wrapped in parens
885
+ return string
886
+ return f"({string})"
887
+
888
+ @staticmethod
889
+ def constant(value: Union[bool, float, int], dtype: torch.dtype) -> OpVarT:
890
+ return repr(value)
891
+
892
+ @staticmethod
893
+ def bitwise_not(x: OpVarT) -> OpVarT:
894
+ return f"~{OpOverrides.paren(x)}"
895
+
896
+ @staticmethod
897
+ def logical_not(a: OpVarT) -> OpVarT:
898
+ return f"{OpOverrides.paren(a)} == 0"
899
+
900
+ @staticmethod
901
+ def bitwise_and(x: OpVarT, y: OpVarT) -> OpVarT:
902
+ return f"{OpOverrides.paren(x)} & {OpOverrides.paren(y)}"
903
+
904
+ @staticmethod
905
+ def bitwise_or(x: OpVarT, y: OpVarT) -> OpVarT:
906
+ return f"{OpOverrides.paren(x)} | {OpOverrides.paren(y)}"
907
+
908
+ @staticmethod
909
+ def bitwise_xor(x: OpVarT, y: OpVarT) -> OpVarT:
910
+ return f"{OpOverrides.paren(x)} ^ {OpOverrides.paren(y)}"
911
+
912
+ @staticmethod
913
+ def bitwise_left_shift(x: OpVarT, y: OpVarT) -> OpVarT:
914
+ return f"{OpOverrides.paren(x)} << {OpOverrides.paren(y)}"
915
+
916
+ @staticmethod
917
+ def bitwise_right_shift(x: OpVarT, y: OpVarT) -> OpVarT:
918
+ return f"{OpOverrides.paren(x)} >> {OpOverrides.paren(y)}"
919
+
920
+ @staticmethod
921
+ def int_truediv(a: OpVarT, b: OpVarT) -> OpVarT:
922
+ # TODO: this is wrong
923
+ # TODO: an easy bandaid is to generate runtime asserts that it's
924
+ # <= 2**53, which is when this equation is correct
925
+ return ops.truediv(a, b)
926
+
927
+ @staticmethod
928
+ def load_seed(name: str, offset: OpVarT) -> OpVarT:
929
+ return ops.load(name, sympy.Integer(offset))
930
+
931
+ def indirect_indexing(
932
+ self,
933
+ var: OpVarT,
934
+ size: Union[sympy.Expr, int],
935
+ check: bool = True,
936
+ wrap_neg: bool = True,
937
+ ) -> sympy.Symbol:
938
+ return sympy_index_symbol(str(var))
939
+
940
+ def check_bounds(
941
+ self, expr: sympy.Expr, size: sympy.Expr, lower: bool, upper: bool
942
+ ) -> None:
943
+ raise NotImplementedError(
944
+ f"{type(self).__name__}: check_bounds should be handled by CSEProxy"
945
+ )
946
+
947
+ def load(self, name: str, index: sympy.Expr) -> OpVarT:
948
+ raise NotImplementedError(
949
+ f"{type(self).__name__}: load should be handled by CSEProxy"
950
+ )
951
+
952
+ def store(
953
+ self, name: str, index: sympy.Expr, value: OpVarT, mode: StoreMode = None
954
+ ) -> None:
955
+ raise NotImplementedError(
956
+ f"{type(self).__name__}: store should be handled by CSEProxy"
957
+ )
958
+
959
+ def store_reduction(self, name: str, index: sympy.Expr, value: OpVarT) -> None:
960
+ raise NotImplementedError(
961
+ f"{type(self).__name__}: store_reduction should be handled by CSEProxy"
962
+ )
963
+
964
+ def reduction(
965
+ self,
966
+ dtype: torch.dtype,
967
+ src_dtype: torch.dtype,
968
+ reduction_type: ReductionType,
969
+ value: Union[OpVarT, tuple[OpVarT, ...]],
970
+ ) -> Union[OpVarT, tuple[OpVarT, ...]]:
971
+ raise NotImplementedError(
972
+ f"{type(self).__name__}: reduction should be handled by CSEProxy"
973
+ )
974
+
975
+ def scan(
976
+ self,
977
+ dtypes: tuple[torch.dtype, ...],
978
+ combine_fn: Callable[
979
+ [tuple[OpVarT, ...], tuple[OpVarT, ...]],
980
+ tuple[OpVarT, ...],
981
+ ],
982
+ values: tuple[OpVarT, ...],
983
+ ) -> tuple[OpVarT, ...]:
984
+ raise NotImplementedError(
985
+ f"{type(self).__name__}: scan should be handled by CSEProxy"
986
+ )
987
+
988
+ def sort(
989
+ self,
990
+ dtypes: tuple[torch.dtype, ...],
991
+ values: tuple[OpVarT, ...],
992
+ stable: bool,
993
+ descending: bool,
994
+ ) -> tuple[OpVarT, ...]:
995
+ raise NotImplementedError(
996
+ f"{type(self).__name__}: sort should be handled by CSEProxy"
997
+ )
998
+
999
+ def bucketize(
1000
+ self,
1001
+ values: OpVarT,
1002
+ boundaries: tuple[str, sympy.Expr, sympy.Expr, sympy.Expr],
1003
+ boundary_indices: OpVarT,
1004
+ indexing_dtype: torch.dtype,
1005
+ right: bool,
1006
+ sorter: Optional[tuple[str, sympy.Expr]] = None,
1007
+ sorter_indices: Optional[OpVarT] = None,
1008
+ ) -> OpVarT:
1009
+ raise NotImplementedError(
1010
+ f"{type(self).__name__}: bucketize should be handled by CSEProxy"
1011
+ )
1012
+
1013
+ def halide_clamp(self, value: OpVarT, size: sympy.Expr, check: bool) -> OpVarT:
1014
+ raise NotImplementedError(
1015
+ f"{type(self).__name__}: halide_clamp only implemented for Halide backend"
1016
+ )
1017
+
1018
+ def inline_asm_elementwise(
1019
+ self,
1020
+ *inputs: OpVarT,
1021
+ asm: str,
1022
+ constraints: Optional[str] = None,
1023
+ dtype: torch.dtype = torch.float32,
1024
+ is_pure: bool = True,
1025
+ pack: int = 1,
1026
+ ) -> OpVarT:
1027
+ raise NotImplementedError(
1028
+ f"{type(self).__name__}: inline_asm_elementwise only implemented for Triton backend"
1029
+ )
1030
+
1031
+ def output(self, *args: OpVarT) -> None:
1032
+ raise AssertionError(
1033
+ f"{type(self).__name__}: ops.output should not appear at codegen time"
1034
+ )
1035
+
1036
+ def placeholder(self, index: int) -> OpVarT:
1037
+ raise AssertionError(
1038
+ f"{type(self).__name__}: ops.placeholder should not appear at codegen time"
1039
+ )
1040
+
1041
+ @staticmethod
1042
+ def _unimplemented(name: str) -> Callable[..., OpVarT]:
1043
+ def unimplemented(self: OpOverrides, *args: Any, **kwargs: Any) -> OpVarT:
1044
+ raise NotImplementedError(
1045
+ f"{type(self).__name__} does not implement ops.{name}"
1046
+ )
1047
+
1048
+ unimplemented.__name__ = name
1049
+ unimplemented.is_unimplemented = True # type: ignore[attr-defined]
1050
+ return unimplemented
1051
+
1052
+ @classmethod
1053
+ def _is_unimplemented(cls, name: str) -> bool:
1054
+ fn = getattr(cls, name, None)
1055
+ default_fn = getattr(OpsHandler, name, None)
1056
+ return not fn or fn == default_fn or getattr(fn, "is_unimplemented", False)
1057
+
1058
+ @classmethod
1059
+ def _initialize_pointwise_overrides(cls, target: str) -> None:
1060
+ assert target in ("triton", "cpp", "cppvec", "halide", "mps"), target
1061
+
1062
+ for funcname, data in pointwise_overrides_data.items():
1063
+ impl = getattr(data, target)
1064
+ if impl is None:
1065
+ if cls._is_unimplemented(funcname):
1066
+ setattr(cls, funcname, cls._unimplemented(funcname))
1067
+ else:
1068
+ assert funcname not in cls.__dict__, (
1069
+ f"multiple definitions of {funcname} on {cls.__name__}"
1070
+ )
1071
+ impl.__name__ = funcname
1072
+ setattr(cls, funcname, staticmethod(impl))
1073
+
1074
+
1075
@dataclasses.dataclass
class OverridesData:
    """Per-op table entry: how each codegen backend renders one pointwise
    special function (each field is a code-string formatter, or None when
    the backend has no implementation)."""

    # aten-level op name (e.g. "special_bessel_j0")
    name: str
    cpp: Callable[..., str]
    # None when not impl in libdevice/triton
    triton: Optional[Callable[..., str]] = None
    # None when not impl in aten/.../vec
    cppvec: Optional[Callable[..., str]] = None
    type_promotion_kind: ELEMENTWISE_TYPE_PROMOTION_KIND = (
        ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT
    )
    halide: Optional[Callable[..., str]] = None
    mps: Optional[Callable[..., str]] = None
1088
+
1089
+
1090
# NB: if you add a new special function, don't forget to update
# torch._inductor.ops_handler too
#
# Table of special-function pointwise ops keyed by the ops.* name.  Each entry
# supplies per-backend code formatters (see OverridesData); entries mentioned
# only in comments (entr, erfinv, logit, ...) are handled by decompositions
# instead of backend overrides.
pointwise_overrides_data: dict[str, OverridesData] = dict(
    airy_ai=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x: f"airy_ai_forward({x})",
        name="special_airy_ai",
    ),
    bessel_j0=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x: f"bessel_j0_forward({x})",
        triton=lambda x: f"libdevice.j0({x})",
        name="special_bessel_j0",
    ),
    bessel_j1=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x: f"bessel_j1_forward({x})",
        triton=lambda x: f"libdevice.j1({x})",
        name="special_bessel_j1",
    ),
    bessel_y0=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x: f"bessel_y0_forward({x})",
        triton=lambda x: f"libdevice.y0({x})",
        name="special_bessel_y0",
    ),
    bessel_y1=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x: f"bessel_y1_forward({x})",
        triton=lambda x: f"libdevice.y1({x})",
        name="special_bessel_y1",
    ),
    digamma=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x: f"calc_digamma({x})",
        cppvec=lambda x: f"{x}.digamma()",
        name="digamma",
    ),
    # no cpp nor triton implementation for entr, it is defined as decomposition
    # erf, erfc
    erfcx=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x: f"calc_erfcx({x})",
        triton=lambda x: f"libdevice.erfcx({x})",
        name="special_erfcx",
    ),
    fma=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x, y, z: f"std::fma({x}, {y}, {z})",
        cppvec=lambda x, y, z: f"fmadd({x}, {y}, {z})",
        triton=lambda x, y, z: f"libdevice.fma({x}, {y}, {z})",
        name="fma",
    ),
    # erfinv, exp2, expit, gammaln
    igamma=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x, y: f"calc_igamma({x}, {y})",
        name="igamma",
    ),
    igammac=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x, y: f"calc_igammac({x}, {y})",
        name="igammac",
    ),
    gammainc=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x, y: f"calc_igamma({x}, {y})",
        name="special_gammainc",
    ),
    gammaincc=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x, y: f"calc_igammac({x}, {y})",
        name="special_gammaincc",
    ),
    i0=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x: f"calc_i0({x})",
        triton=lambda x: f"libdevice.cyl_bessel_i0({x})",
        cppvec=lambda x: f"{x}.i0()",
        name="i0",
    ),
    i0e=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x: f"calc_i0e({x})",
        cppvec=lambda x: f"{x}.i0e()",
        name="special_i0e",
    ),
    i1=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x: f"calc_i1({x})",
        triton=lambda x: f"libdevice.cyl_bessel_i1({x})",
        name="special_i1",
    ),
    i1e=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x: f"calc_i1e({x})",
        name="special_i1e",
    ),
    log_ndtr=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x: f"calc_log_ndtr({x})",
        name="special_log_ndtr",
    ),
    # logit
    modified_bessel_i0=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x: f"modified_bessel_i0_forward({x})",
        triton=lambda x: f"libdevice.cyl_bessel_i0({x})",
        name="special_modified_bessel_i0",
    ),
    modified_bessel_i1=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x: f"modified_bessel_i1_forward({x})",
        triton=lambda x: f"libdevice.cyl_bessel_i1({x})",
        name="special_modified_bessel_i1",
    ),
    modified_bessel_k0=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x: f"modified_bessel_k0_forward({x})",
        name="special_modified_bessel_k0",
    ),
    modified_bessel_k1=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x: f"modified_bessel_k1_forward({x})",
        name="special_modified_bessel_k1",
    ),
    # multigamma
    ndtr=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x: f"calc_ndtr({x})",
        name="special_ndtr",
    ),
    ndtri=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x: f"calc_ndtri({x})",
        name="special_ndtri",
    ),
    polygamma=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        # x is the (integer) order, y the argument; order 0/1 dispatch to the
        # cheaper digamma/trigamma kernels.
        cpp=lambda x,
        y: f"{x} == 0 ? calc_digamma({y}) : ({x} == 1 ? trigamma({y}) : calc_polygamma({y}, {x}))",
        name="polygamma",
    ),
    # psi - alias to digamma
    # round
    scaled_modified_bessel_k0=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x: f"scaled_modified_bessel_k0_forward({x})",
        name="special_scaled_modified_bessel_k0",
    ),
    scaled_modified_bessel_k1=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x: f"scaled_modified_bessel_k1_forward({x})",
        name="special_scaled_modified_bessel_k1",
    ),
    # sinc
    spherical_bessel_j0=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x: f"spherical_bessel_j0_forward({x})",
        name="special_spherical_bessel_j0",
    ),
    zeta=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x, y: f"zeta({x}, {y})",
        name="special_zeta",
    ),
    chebyshev_polynomial_t=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x, y: f"chebyshev_polynomial_t_forward({x}, {y})",
        name="special_chebyshev_polynomial_t",
    ),
    chebyshev_polynomial_u=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x, y: f"chebyshev_polynomial_u_forward({x}, {y})",
        name="special_chebyshev_polynomial_u",
    ),
    chebyshev_polynomial_v=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x, y: f"chebyshev_polynomial_v_forward({x}, {y})",
        name="special_chebyshev_polynomial_v",
    ),
    chebyshev_polynomial_w=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x, y: f"chebyshev_polynomial_w_forward({x}, {y})",
        name="special_chebyshev_polynomial_w",
    ),
    legendre_polynomial_p=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x, y: f"legendre_polynomial_p_forward({x}, {y})",
        name="special_legendre_polynomial_p",
    ),
    shifted_chebyshev_polynomial_t=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x, y: f"shifted_chebyshev_polynomial_t_forward({x}, {y})",
        name="special_shifted_chebyshev_polynomial_t",
    ),
    shifted_chebyshev_polynomial_u=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x, y: f"shifted_chebyshev_polynomial_u_forward({x}, {y})",
        name="special_shifted_chebyshev_polynomial_u",
    ),
    shifted_chebyshev_polynomial_v=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x, y: f"shifted_chebyshev_polynomial_v_forward({x}, {y})",
        name="special_shifted_chebyshev_polynomial_v",
    ),
    shifted_chebyshev_polynomial_w=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x, y: f"shifted_chebyshev_polynomial_w_forward({x}, {y})",
        name="special_shifted_chebyshev_polynomial_w",
    ),
    hermite_polynomial_h=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x, y: f"hermite_polynomial_h_forward({x}, {y})",
        name="special_hermite_polynomial_h",
    ),
    hermite_polynomial_he=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x, y: f"hermite_polynomial_he_forward({x}, {y})",
        name="special_hermite_polynomial_he",
    ),
    laguerre_polynomial_l=OverridesData(
        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
        cpp=lambda x, y: f"laguerre_polynomial_l_forward({x}, {y})",
        name="special_laguerre_polynomial_l",
    ),
)
1317
+
1318
+
1319
def is_buffer_removed(name: str) -> bool:
    """True if `name` was removed at the graph or current-kernel level
    (including inplace-alias removals)."""
    removal_sets = (
        V.graph.removed_buffers,
        V.kernel.removed_buffers,
        V.graph.inplaced_to_remove,
        V.kernel.inplaced_to_remove,
    )
    return any(name in removed for removed in removal_sets)
1329
+
1330
+
1331
class DeferredLine(DeferredLineBase):
    """A code line that is suppressed ("unwritten") whenever its buffer `name`
    later ends up in V.graph.removed_buffers (or kernel-level removal sets)."""

    def __init__(self, name: str, line: str):
        super().__init__(line)
        self.name = name
        # nesting DeferredLines would make suppression ambiguous
        assert not isinstance(line, DeferredLineBase)

    def __call__(self) -> Optional[str]:
        # Emit nothing once the buffer has been removed.
        if is_buffer_removed(self.name):
            return None
        return self.line

    def _new_line(self, line: str) -> DeferredLine:
        return DeferredLine(self.name, line)
1346
+
1347
+
1348
class BracesBuffer(IndentedBuffer):
    """IndentedBuffer whose indent() also emits C-style braces.

    A positive offset opens `{` blocks on entry and closes them on exit; a
    negative offset does the reverse (closes `}` on entry, reopens on exit).
    """

    def indent(self, offset: int = 1) -> contextlib.AbstractContextManager[None]:
        @contextlib.contextmanager
        def ctx() -> Iterator[None]:
            if offset >= 0:
                for _ in range(offset):
                    self.writeline("{")
                    self._indent += 1
                yield
                for _ in range(offset):
                    self._indent -= 1
                    self.writeline("}")
            else:
                # Negative offset: temporarily dedent out of braces.
                for _ in range(-offset):
                    self._indent -= 1
                    self.writeline("}")
                yield
                for _ in range(-offset):
                    self.writeline("{")
                    self._indent += 1

        return ctx()
1367
+
1368
+
1369
class InplacedBuffer(NamedTuple):
    """An in_out_ptr kernel argument shared by several aliased buffers."""

    # kernel-signature name, e.g. "in_out_ptr0"
    inner_name: str
    # all outer (graph-level) buffer names aliased onto this argument
    other_names: list[str]
1372
+
1373
+
1374
@dataclasses.dataclass
class ArgName:
    """A kernel-signature argument name, optionally marked tl.constexpr."""

    name: str
    # is_constexpr=True appends " : tl.constexpr" to the rendered argument
    is_constexpr: bool = False

    def full_name(self) -> str:
        """Render the argument as it appears in a Triton kernel signature."""
        suffix = " : tl.constexpr" if self.is_constexpr else ""
        return f"{self.name}{suffix}"
1382
+
1383
+
1384
class RemovedArg:
    """Sentinel marking an argument slot whose buffer was removed."""

    def __str__(self) -> str:
        return "REMOVED"


# Shared singleton; membership checks use isinstance(..., RemovedArg).
REMOVED = RemovedArg()
1390
+
1391
+
1392
class KernelArgs:
    """Registry of a kernel's arguments.

    Maps outer (graph-level) buffer names and size expressions to inner
    (kernel-signature) names such as ``in_ptr0``, ``out_ptr0``,
    ``in_out_ptr0`` and ``ks0``, and renders argument lists for both the
    C++ and Python codegen paths.
    """

    @staticmethod
    def _lookup(
        prefix: str,
        odict: Union[dict[_T, Union[str, RemovedArg]], dict[_T, str]],
        name: _T,
    ) -> str:
        """Return the inner name for `name`, minting "{prefix}{N}" on first use
        (N = current dict size, acting as a stable counter)."""
        result: Union[str, RemovedArg] = odict.get(name, REMOVED)
        if isinstance(result, RemovedArg):
            odict[name] = new_result = f"{prefix}{len(odict)}"
            return new_result
        return result

    def __init__(self) -> None:
        # outer name -> inner name ("in_ptr0", "seed0", ...)
        self.input_buffers: dict[str, str] = {}
        # outer name -> inner name ("out_ptr0", ...) or RemovedArg sentinel
        self.output_buffers: dict[str, Union[str, RemovedArg]] = {}
        # outer name -> shared InplacedBuffer ("in_out_ptr0") or RemovedArg
        self.inplace_buffers: dict[str, Union[InplacedBuffer, RemovedArg]] = {}
        # size expression -> inner name ("ks0", ...)
        self.sizevars: dict[sympy.Expr, str] = {}
        # scratch/semaphore buffers required by the kernel
        self.workspace_args: list[WorkspaceArg] = []

    def __repr__(self) -> str:
        return "KernelArgs({})".format(
            ", ".join(
                map(
                    repr,
                    [
                        self.input_buffers,
                        self.output_buffers,
                        self.inplace_buffers,
                        self.sizevars,
                    ],
                )
            )
        )

    @staticmethod
    def _buffer_is_marked_removed(name: Any) -> bool:
        # this function is needed by MTIA
        return isinstance(name, RemovedArg)

    def input(self, name: str) -> str:
        """Register (or look up) `name` as a kernel input; return its inner name."""
        if V.graph.scheduler:
            # resolve mutations to the buffer actually holding the data
            name = V.graph.scheduler.mutation_real_name.get(name, name)
        assert name not in V.graph.removed_buffers, name
        # A buffer already registered as an output/inplace arg reuses that name.
        if name in self.output_buffers:
            return cast(str, self.output_buffers[name])
        if name in self.inplace_buffers:
            return cast(InplacedBuffer, self.inplace_buffers[name]).inner_name
        if name.startswith("seed"):
            return self._lookup("seed", self.input_buffers, name)
        return self._lookup("in_ptr", self.input_buffers, name)

    def output(self, name: str) -> str:
        """Register (or look up) `name` as a kernel output; return its inner name."""
        if V.graph.scheduler:
            name = V.graph.scheduler.mutation_real_name.get(name, name)
        assert name not in V.graph.removed_buffers, name
        if name in self.inplace_buffers:
            return cast(InplacedBuffer, self.inplace_buffers[name]).inner_name
        return self._lookup("out_ptr", self.output_buffers, name)

    def make_inplace(self, input_name: str, output_name: str) -> None:
        """Alias `output_name` onto `input_name` as one in_out_ptr argument."""
        if input_name in V.graph.unaligned_buffers:
            # alignment is a property of the underlying storage, so it carries over
            V.graph.unaligned_buffers.add(output_name)
        assert output_name not in self.inplace_buffers, output_name
        if input_name in self.inplace_buffers:
            # Extend the existing alias group.
            buf = self.inplace_buffers[input_name]
            assert not isinstance(buf, RemovedArg)
            buf.other_names.append(output_name)
            self.inplace_buffers[output_name] = buf
        else:
            # Index includes removed slots so names stay unique over time.
            alive_buffers = [
                val
                for val in self.inplace_buffers.values()
                if not isinstance(val, RemovedArg)
            ]
            removed_buffers = [
                val
                for val in self.inplace_buffers.values()
                if isinstance(val, RemovedArg)
            ]
            inplace_buffer_idx = len(unique(alive_buffers)) + len(removed_buffers)
            buf = InplacedBuffer(
                f"in_out_ptr{inplace_buffer_idx}",
                [input_name, output_name],
            )
            self.inplace_buffers[input_name] = buf
            self.inplace_buffers[output_name] = buf

    def workspace(self, nbytes: sympy.Expr, zero_fill: bool) -> tuple[str, int]:
        """
        Allocate or extend a workspace buffer of nbytes bytes.

        This function manages the allocation of a workspace buffer. It either creates
        a new WorkspaceArg or extends an existing one.

        Note:
        - Calling this function will in-place mutate the args by adding or updating
          a WorkspaceArg.
        - The codegen for generating the Python argdefs and call_defs will check
          this field and allocate the buffer accordingly.
        - A new argument "ws_ptr" will be present in the generated code.

        Args:
            nbytes (sympy.Expr): The number of bytes to allocate.
            zero_fill (bool): Whether to initialize the buffer to zero.

        Returns:
            Tuple[str, int]: A tuple containing:
            - "ws_ptr": A string identifier for the workspace pointer.
            - offset: An integer representing the byte offset in the workspace.
        """
        arg = WorkspaceArg(
            count=nbytes,
            zero_mode=WorkspaceZeroMode.from_bool(zero_fill),
            device=V.graph.get_current_device_or_throw(),
            outer_name=WorkspaceArg.unique_name(),
        )
        for i, existing_arg in enumerate(self.workspace_args):
            if WorkspaceArg.can_join(existing_arg, arg):
                # Extend the joinable workspace; new data starts at the old end.
                offset = existing_arg.count
                self.workspace_args[i] = WorkspaceArg.join(existing_arg, arg)
                return existing_arg.inner_name, offset
            assert (
                existing_arg.inner_name != arg.inner_name
                and existing_arg.outer_name != arg.outer_name
            ), existing_arg
        self.workspace_args.append(arg)
        return arg.inner_name, 0

    def semaphores(self, min_size: sympy.Expr) -> str:
        """
        Lazily allocate a graph-wide semaphores buffer with at least min_size. This is a single buffer shared by
        all kernels and zero initialized once at graph start. Each kernel must leave the buffer zeroed on exit.

        Warning: multiple calls to this function will return the same buffer.

        Args:
            min_size: the number of int32 semaphores required

        Returns:
            name of the semaphores buffer
        """
        current_device = V.graph.get_current_device_or_throw()
        arg = WorkspaceArg(
            count=min_size,
            zero_mode=WorkspaceZeroMode.ZERO_PER_GRAPH,
            dtype=torch.uint32,
            inner_name="sem_ptr",
            outer_name=f"semaphores_{current_device.type}_{current_device.index}",
            device=current_device,
        )
        for existing_arg in self.workspace_args:
            if existing_arg.inner_name == arg.inner_name:
                # already allocated for this graph; must be identical
                assert arg == existing_arg, (arg, existing_arg)
        self.workspace_args.append(arg)
        return arg.inner_name

    def seed_offset(self, name: str, value: int) -> str:
        """Lift a constant RNG seed offset into a sizevar-style argument."""
        assert isinstance(value, int), (type(value), value)
        # here we are lifting a constant integer into an arg to the kernel to try to get additional cache hits
        value = sympy.Integer(value)
        if value in self.sizevars:
            return self.sizevars[value]
        if name in self.sizevars.values():
            # disambiguate by appending the count of existing same-prefix names
            name = (
                f"{name}{sum(1 for v in self.sizevars.values() if v.startswith(name))}"
            )
        self.sizevars[value] = name
        return name

    def size(self, name: sympy.Symbol) -> str:
        """Register (or look up) a size symbol; return its inner "ksN" name."""
        assert isinstance(name, sympy.Symbol), (type(name), name)
        if name.name == "seed":
            self.sizevars[name] = "seed"  # don't manage the name of seeds
            return "seed"
        return self._lookup("ks", self.sizevars, name)

    def call_names(self) -> Iterator[str]:
        """Outer names/exprs passed at the call site (inplace args excluded)."""
        return chain(
            self.input_buffers.keys(), self.output_buffers.keys(), self.sizevars.keys()
        )

    def arg_name(self, name: str) -> Optional[str]:
        """
        Returns inner name of a given outer name.
        """
        inplaced = self.inplace_buffers.get(name, None)
        if inplaced is not None and not isinstance(inplaced, RemovedArg):
            return inplaced.inner_name
        output_name = self.output_buffers.get(name, None)
        if output_name is not None and not isinstance(output_name, RemovedArg):
            return output_name
        return self.input_buffers.get(name, None)

    def wrap_ptr_arg(self, buf: str, dtype: torch.dtype) -> str:
        """Hook for subclasses to wrap a pointer argument at the call site."""
        return buf

    def wrap_size_arg(self, size: SymbolLike) -> str:
        """Hook for subclasses to wrap a size argument at the call site."""
        return str(size)

    def cpp_argdefs(
        self, dtype_to_cpp_type: Optional[dict[torch.dtype, str]] = None
    ) -> tuple[list[str], list[str], list[str]]:
        """Render (arg_defs, call_args, arg_types) for a C++ kernel signature."""
        from .cpp_utils import INDEX_TYPE

        if dtype_to_cpp_type is None:
            from .cpp_utils import DTYPE_TO_CPP

            dtype_to_cpp_type = DTYPE_TO_CPP

        call_args = []
        arg_defs = []
        arg_types = []
        # in_out_ptr arguments first, then pure inputs, outputs, and sizevars
        for inplaced in unique(self.inplace_buffers.values()):
            if isinstance(inplaced, RemovedArg):
                continue
            outer = inplaced.other_names[-1]
            inner = inplaced.inner_name
            dtype = V.graph.get_dtype(outer)
            cpp_dtype = dtype_to_cpp_type[dtype]
            arg_defs.append(f"{cpp_dtype}* {inner}")
            call_args.append(self.wrap_ptr_arg(outer, dtype))
            arg_types.append(f"{cpp_dtype}*")
        for outer, inner in self.input_buffers.items():
            if outer in self.inplace_buffers:
                continue
            dtype = V.graph.get_dtype(outer)
            cpp_dtype = dtype_to_cpp_type[dtype]
            arg_defs.append(f"const {cpp_dtype}* {inner}")
            call_args.append(self.wrap_ptr_arg(outer, dtype))
            arg_types.append(f"const {cpp_dtype}*")
        for outer, maybe_inner in self.output_buffers.items():
            if outer in self.inplace_buffers or isinstance(maybe_inner, RemovedArg):
                continue
            dtype = V.graph.get_dtype(outer)
            cpp_dtype = dtype_to_cpp_type[dtype]
            arg_defs.append(f"{cpp_dtype}* {maybe_inner}")
            call_args.append(self.wrap_ptr_arg(outer, dtype))
            arg_types.append(f"{cpp_dtype}*")
        for outer, inner in self.sizevars.items():
            arg_defs.append(f"const {INDEX_TYPE} {inner}")
            call_args.append(self.wrap_size_arg(outer))
            arg_types.append(f"const {INDEX_TYPE}")
            if V.graph.wrapper_code:
                V.graph.wrapper_code.ensure_size_computed(outer)
        assert not self.workspace_args, "Workspace not supported on CPU "
        return arg_defs, call_args, arg_types

    def python_argdefs(
        self,
    ) -> tuple[list[ArgName], list[str], list[KernelArgType], list[Any]]:
        """Render (arg_defs, call_args, precompile_args, arg_types) for the
        Python/Triton launch path; same ordering as cpp_argdefs plus workspaces."""
        arg_defs: list[ArgName] = []
        call_args: list[str] = []
        arg_types: list[Any] = []
        precompile_args: list[KernelArgType] = []
        for inplaced in unique(self.inplace_buffers.values()):
            if isinstance(inplaced, RemovedArg):
                continue
            arg_defs.append(ArgName(inplaced.inner_name))
            call_args.append(inplaced.other_names[-1])
            arg_types.append(V.graph.get_dtype(inplaced.other_names[-1]))
            precompile_args.append(
                TensorArg(
                    name=inplaced.inner_name,
                    buffer=inplaced.other_names[-1],
                    dtype=V.graph.get_dtype(inplaced.other_names[-1]),
                )
            )
        for outer, inner in chain(
            self.input_buffers.items(), self.output_buffers.items()
        ):
            if outer in self.inplace_buffers or isinstance(inner, RemovedArg):
                continue
            arg_defs.append(ArgName(inner))
            call_args.append(outer)
            arg_types.append(V.graph.get_dtype(outer))
            precompile_args.append(
                TensorArg(
                    name=inner,
                    buffer=outer,
                    dtype=V.graph.get_dtype(outer),
                )
            )
        for outer, inner in self.sizevars.items():
            arg_defs.append(ArgName(inner))
            call_args.append(outer)
            arg_types.append(type(outer))
            precompile_args.append(SizeArg(inner, outer))
            if V.graph.wrapper_code:
                V.graph.wrapper_code.ensure_size_computed(outer)
        for arg in self.workspace_args:
            arg_defs.append(ArgName(arg.inner_name))
            call_args.append(arg.outer_name)
            precompile_args.append(arg)
            arg_types.append(arg.dtype)
        return arg_defs, call_args, precompile_args, arg_types

    def aliases(self) -> Iterator[tuple[str, str]]:
        """Yield (stale inner name, in_out_ptr inner name) pairs for buffers
        that were first registered separately and later made inplace."""
        for inplaced in unique(self.inplace_buffers.values()):
            if isinstance(inplaced, RemovedArg):
                continue
            for other in inplaced.other_names:
                if (
                    other in V.graph.inplaced_to_remove
                    or other in V.kernel.inplaced_to_remove
                ):
                    continue
                if other in self.input_buffers:
                    yield self.input_buffers[other], inplaced.inner_name
                if other in self.output_buffers:
                    yield cast(str, self.output_buffers[other]), inplaced.inner_name

    def is_removed(self, name: str) -> bool:
        """True when `name` is marked removed as both an output and inplace arg."""
        return isinstance(
            self.output_buffers.get(name, REMOVED), RemovedArg
        ) and isinstance(self.inplace_buffers.get(name, REMOVED), RemovedArg)

    # Includes inplace buffers, excludes removed buffers. Essentially,
    # after you do a call into this kernel, which buffers actually contain
    # updated data?  Modeled off of python_argdefs.
    def live_output_buffers(self) -> OrderedSet[str]:
        live_outs: OrderedSet[str] = OrderedSet()
        for inplaced in unique(self.inplace_buffers.values()):
            if isinstance(inplaced, RemovedArg):
                continue
            live_outs.add(inplaced.other_names[-1])
        for outer, inner in self.output_buffers.items():
            if outer in self.inplace_buffers or isinstance(inner, RemovedArg):
                continue
            live_outs.add(outer)
        return live_outs
1723
+
1724
+
1725
class CSEVariable:
    """A name bound to a common-subexpression, plus backend-annotatable metadata.

    Backends may subclass (overloading ``Kernel.create_cse_var``) and hook
    ``update_on_args`` to attach annotations; see TritonCSEVariable in
    triton.py.  Identity (hash/eq) is purely by name.
    """

    def __init__(
        self,
        name: str,
        bounds: ValueRanges[Any],
        dtype: Optional[torch.dtype] = None,
    ):
        super().__init__()
        assert isinstance(bounds, ValueRanges), type(bounds)
        self.name = name
        self.bounds = bounds
        # how many expressions reuse this variable (starts at 1 for its definition)
        self.use_count = 1
        self.dtype = dtype

    def __str__(self) -> str:
        return self.name

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self.name!r})"

    def __hash__(self) -> int:
        return hash(self.name)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, CSEVariable):
            return False
        return other.name == self.name

    def update_on_args(self, name: str, args: Any, kwargs: Any) -> None:
        # Annotation hook for subclasses; no-op by default.
        pass
1759
+
1760
+
1761
# Cache-key type produced by CSE.augment_key() (plain str unless a backend
# mixes in extra state).
AugmentedKeyT = TypeVar("AugmentedKeyT", default=str)
# Backend-specific CSEVariable subclass managed by a CSE instance.
CSEVariableType = TypeVar("CSEVariableType", bound=CSEVariable, default=CSEVariable)
1763
+
1764
if TYPE_CHECKING:
    # Key for CSE.reduction_cache: (output dtype, reduction kind, input value(s)).
    ReductionCacheKey = tuple[
        torch.dtype,
        ReductionType,
        Union[CSEVariable, tuple[CSEVariable, ...]],
    ]
1770
+
1771
+
1772
+ class CSE(Generic[CSEVariableType, AugmentedKeyT]):
1773
+ """Common subexpression elimination"""
1774
+
1775
    def __init__(
        self,
        prefix: str = "",
        suffix: str = "",
        name_prefix: str = "tmp",
        iter_buffers: Optional[itertools.count[int]] = None,
        store_cache: Optional[MutableMapping[str, CSEVariableType]] = None,
        reduction_cache: Optional[
            MutableMapping[ReductionCacheKey, CSEVariableType]
        ] = None,
        varname_map: Optional[dict[str, CSEVariableType]] = None,
    ):
        """Create a CSE namespace.

        Optional caches/counters may be shared with a parent CSE (see clone()),
        so clones hand out non-colliding names and see the same cached values.
        """
        # text emitted before/after each generated assignment line
        self.prefix = prefix
        self.suffix = suffix
        # expression text (possibly augmented, see augment_key) -> variable
        self._cache: MutableMapping[AugmentedKeyT, CSEVariableType] = {}
        self.name_prefix = name_prefix
        # buffer name -> variable last stored to it
        self.store_cache: MutableMapping[str, CSEVariableType] = store_cache or {}
        self.reduction_cache: MutableMapping[ReductionCacheKey, CSEVariableType] = (
            reduction_cache or {}
        )
        # shared counter so cloned CSEs never reuse a variable id
        self.iter_buffer_ids: itertools.count[int] = iter_buffers or itertools.count()
        self.invalidated_stores: OrderedSet[str] = OrderedSet()
        # variable name -> CSEVariable object
        self.varname_map: dict[str, CSEVariableType] = varname_map or {}
1798
+
1799
+ def invalidate(self, keep_vars: OrderedSet[CSEVariable]) -> None:
1800
+ for name, tmp in [*self.store_cache.items()]:
1801
+ if tmp not in keep_vars:
1802
+ del self.store_cache[name]
1803
+ self.invalidated_stores.add(name)
1804
+ if keep_vars:
1805
+ self._cache = {k: v for k, v in self._cache.items() if v in keep_vars}
1806
+ else:
1807
+ self._cache = {}
1808
+
1809
+ def clone(self) -> Self:
1810
+ return type(self)(
1811
+ prefix=self.prefix,
1812
+ suffix=self.suffix,
1813
+ name_prefix=self.name_prefix,
1814
+ iter_buffers=self.iter_buffer_ids,
1815
+ store_cache=self.store_cache,
1816
+ varname_map=self.varname_map,
1817
+ reduction_cache=self.reduction_cache,
1818
+ )
1819
+
1820
+ def scoped_copy(self) -> Self:
1821
+ """Return a copy of using ScopedDict so changes to *_cache aren't visible in self"""
1822
+ new_cse = self.clone()
1823
+ new_cse._cache = ScopedDict(self._cache)
1824
+ new_cse.reduction_cache = ScopedDict(self.reduction_cache)
1825
+ new_cse.store_cache = ScopedDict(self.store_cache)
1826
+ return new_cse
1827
+
1828
+ def augment_key(self, cache_key: str) -> AugmentedKeyT:
1829
+ "Override this method to augment cache key with backend specifics"
1830
+ return cast(AugmentedKeyT, cache_key)
1831
+
1832
+ def put(self, cache_key: str, val: CSEVariableType) -> None:
1833
+ self._cache[self.augment_key(cache_key)] = val
1834
+
1835
+ def contains(self, cache_key: str) -> bool:
1836
+ return self.augment_key(cache_key) in self._cache
1837
+
1838
+ def try_get(self, cache_key: str) -> Optional[CSEVariableType]:
1839
+ return self._cache.get(self.augment_key(cache_key), None)
1840
+
1841
+ def get(self, cache_key: str) -> CSEVariableType:
1842
+ return self._cache[self.augment_key(cache_key)]
1843
+
1844
    def generate(
        self,
        buffer: IndentedBuffer,
        expr: Union[str, CSEVariable, OpsValue, IndentedBuffer, DeferredLineBase],
        *,
        bounds: ValueRanges[Any] = ValueRanges.unknown(),
        write: bool = True,
        assignment: bool = True,
        dtype: Optional[torch.dtype] = None,
    ) -> CSEVariableType:
        """CSE an expression: return the cached variable for *expr* if one
        exists, otherwise allocate a new variable and (when *write*) emit the
        line(s) computing it into *buffer*.

        *assignment*=False emits the expression without binding it to a
        variable; at least one of *write*/*assignment* must be set.
        """
        if isinstance(expr, OpsValue):
            expr = expr.value

        assert write or assignment
        if isinstance(expr, CSEVariable):
            # If the expressions were always created with all the information, we could
            # assert expr.bounds == bounds, but sometimes the expression is created
            # with the loose ValueRanges.unknown(), so we need to tighten the bounds
            expr.bounds = expr.bounds.tighten(bounds)
            expr.use_count += 1
            return cast(CSEVariableType, expr)
        elif isinstance(expr, IndentedBuffer):
            cache_key = expr.getvalue()
        elif isinstance(expr, DeferredLineBase):
            cache_key = expr.line
        else:
            assert isinstance(expr, str)
            cache_key = expr
        var = self.try_get(cache_key)
        if not var:
            # Cache miss: mint a fresh variable and remember it for this key.
            var = self.newvar(bounds, dtype)
            self.put(cache_key, var)
            if write:
                if V.kernel.current_node:
                    V.kernel.current_node.codegen_originating_info(
                        buffer, only_once=True
                    )
                if isinstance(expr, IndentedBuffer):
                    if assignment:
                        buffer.writeline(f"{self.prefix}{var} =")
                    buffer.splice(expr)
                    buffer.writeline(self.suffix)
                elif isinstance(expr, DeferredLineBase):
                    assert assignment
                    buffer.writeline(
                        expr._new_line(f"{self.prefix}{var} = {expr.line}{self.suffix}")
                    )
                else:
                    if assignment:
                        line = f"{self.prefix}{var} = {expr}{self.suffix}"
                    else:
                        line = f"{expr}{self.suffix}"
                    buffer.writeline(line)

            # cpp backend cannot determine is_vec at this point
            if (
                assignment
                and (
                    config.test_configs.runtime_triton_dtype_assert
                    or config.test_configs.static_cpp_dtype_assert
                )
                and dtype is not None
                and get_current_backend() != "cpp"
            ):
                check_dtype(buffer, var, dtype)

        else:
            # Cache hit: nothing is emitted; just refine bounds and bump usage.
            var.bounds = var.bounds.tighten(bounds)
            var.use_count += 1

        return var
1915
+
1916
+ def newvar(
1917
+ self,
1918
+ bounds: ValueRanges[Any] = ValueRanges.unknown(),
1919
+ dtype: Optional[torch.dtype] = None,
1920
+ ) -> CSEVariableType:
1921
+ var_name = f"{self.name_prefix}{next(self.iter_buffer_ids)}"
1922
+ var = V.kernel.create_cse_var(var_name, bounds, dtype)
1923
+ self.varname_map[var_name] = var
1924
+ return var
1925
+
1926
+ def namedvar(
1927
+ self,
1928
+ name: str,
1929
+ bounds: ValueRanges[Any] = ValueRanges.unknown(),
1930
+ dtype: Optional[torch.dtype] = None,
1931
+ ) -> CSEVariableType:
1932
+ torch._check_value(
1933
+ name not in self.varname_map, lambda: f"duplicate name: {name}"
1934
+ )
1935
+ var = V.kernel.create_cse_var(name, bounds, dtype)
1936
+ self.varname_map[name] = var
1937
+ return var
1938
+
1939
+
1940
class CodeGen:
    """Base class giving code generators a context-manager protocol backed
    by a single ``contextlib.ExitStack`` (see ``Kernel.__enter__``, which
    pushes handler contexts onto it)."""

    def __init__(self) -> None:
        super().__init__()
        self.exit_stack = contextlib.ExitStack()

    def __enter__(self) -> Self:
        self.exit_stack.__enter__()
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        # Intentionally does NOT return ExitStack.__exit__'s result, so this
        # context manager never suppresses exceptions.
        self.exit_stack.__exit__(exc_type, exc_val, exc_tb)
1951
+
1952
+
1953
class Kernel(CodeGen, Generic[CSEVariableType]):
    """Base class for a single generated kernel.

    Holds the three output code sections (``loads``/``compute``/``stores``),
    the CSE state, and bookkeeping about which buffers the kernel reads,
    writes, and can remove. Backend subclasses implement the abstract
    ``load``/``store``/``reduction``/... methods.
    """

    # Text prepended/appended to each generated assignment line.
    newvar_prefix: str = ""
    suffix: str = ""
    # Factory for the ops handler installed in __enter__; subclasses set this.
    overrides: Optional[Callable[[], OpsHandler[Any]]] = None

    def __init__(
        self, args: Optional[KernelArgs] = None, increase_kernel_count: bool = True
    ) -> None:
        super().__init__()
        if increase_kernel_count:
            metrics.generated_kernel_count += 1
        self.args = args or KernelArgs()
        # The three code sections the kernel body is assembled from.
        self.loads = IndentedBuffer()
        self.compute = IndentedBuffer()
        self.stores = IndentedBuffer()

        self.num_load = 0
        self.num_reduction = 0

        self.cse: CSE[CSEVariableType, Any] = CSE(self.newvar_prefix, self.suffix)
        self.must_keep_buffers: OrderedSet[str] = OrderedSet()
        self.store_buffer_names: OrderedSet[str] = OrderedSet()
        self._load_mask: Optional[str] = None
        self._load_other: Union[None, int, float] = None
        # OrderedSet in set_current_node
        self.current_node: Optional[SchedulerNode] = None
        self.node_to_bounds: Optional[dict[torch.fx.Node, ValueRanges[Any]]] = None

        self.removed_buffers: OrderedSet[str] = OrderedSet()
        self.inplaced_to_remove: OrderedSet[str] = OrderedSet()

        # key: the buffer to write
        # value: the buffer to read and whose memory can be reused for
        # the buffer specified by key
        self.inplace_update_buffers: dict[str, str] = {}
        # Set minimum number of elements processed per thread.
        self.min_elem_per_thread = 1
        self.kernel_name: Optional[str] = None

    @contextlib.contextmanager
    def set_current_node(self, node: SchedulerNode) -> Iterator[None]:
        """Temporarily set the scheduler node being generated (and its
        precomputed FX-node bounds); restores the prior node on exit."""
        prior = self.current_node
        self.current_node = node
        self.node_to_bounds = node._body.bounds().get_bounds()
        try:
            yield
        finally:
            self.current_node = prior

    @contextlib.contextmanager
    def swap_buffers(
        self,
        lb: IndentedBuffer,
        cb: Optional[IndentedBuffer] = None,
        sb: Optional[IndentedBuffer] = None,
    ) -> Iterator[None]:
        """Temporarily redirect loads/compute/stores into the given buffers,
        with a scoped copy of the CSE state so cached entries created inside
        do not leak out. When no store buffer is given, stores are forbidden
        (asserted empty on exit)."""
        if cb is None:
            cb = lb
        if disallow_stores := sb is None:
            sb = IndentedBuffer()
        loads = self.loads
        compute = self.compute
        stores = self.stores
        cse = self.cse
        self.loads = lb
        self.compute = cb
        self.stores = sb
        self.cse = cse.scoped_copy()
        try:
            yield
        finally:
            self.loads = loads
            self.compute = compute
            self.stores = stores
            self.cse = cse
            if disallow_stores:
                assert not sb, "unexpected store inside swap_buffers"

    def load(self, name: str, index: sympy.Expr) -> CSEVariable:
        raise NotImplementedError

    def indirect_load(self, name: str, index: sympy.Expr) -> CSEVariable:
        """A load the depends on an index we have read"""
        prior = self.loads
        try:
            # put the load in the compute section as it might have deps
            self.loads = self.compute
            return self.load(name, index)
        finally:
            self.loads = prior

    def store_reduction(self, name: str, index: sympy.Expr, value: CSEVariable) -> None:
        raise NotImplementedError

    def store(
        self, name: str, index: sympy.Expr, value: CSEVariable, mode: StoreMode = None
    ) -> None:
        raise NotImplementedError

    def reduction(
        self,
        dtype: torch.dtype,
        src_dtype: torch.dtype,
        reduction_type: ReductionType,
        value: Union[CSEVariable, tuple[CSEVariable, ...]],
    ) -> Union[CSEVariable, tuple[CSEVariable, ...]]:
        raise NotImplementedError

    def scan(
        self,
        dtypes: tuple[torch.dtype, ...],
        combine_fn: Callable[
            [tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]
        ],
        values: tuple[CSEVariable, ...],
    ) -> tuple[CSEVariable, ...]:
        raise NotImplementedError

    def sort(
        self,
        dtypes: tuple[torch.dtype, ...],
        values: tuple[CSEVariable, ...],
        stable: bool,
        descending: bool,
    ) -> tuple[CSEVariable, ...]:
        raise NotImplementedError

    def var_ranges(self) -> dict[sympy.Symbol, sympy.Expr]:
        raise NotImplementedError

    def bucketize(
        self,
        values: CSEVariable,
        boundaries: tuple[str, sympy.Expr, sympy.Expr, sympy.Expr],
        boundary_indices: CSEVariable,
        indexing_dtype: torch.dtype,
        right: bool,
        sorter: Optional[tuple[str, sympy.Expr]] = None,
        sorter_indices: Optional[CSEVariable] = None,
    ) -> CSEVariable:
        """
        See [Note: Inductor bucketize op]
        """
        raise NotImplementedError

    @property
    def assert_function(self) -> str:
        # Backend-specific name of the runtime assert callable.
        raise NotImplementedError

    def indirect_assert(
        self,
        var: Union[CSEVariable, str],
        lower: Optional[str],
        upper: Optional[str],
        mask: Optional[Union[CSEVariable, str]] = None,
    ) -> str:
        """Build the source for a runtime bounds assert on *var* — i.e.
        ``lower <= var < upper`` — optionally disabled where *mask* is false.
        At least one of *lower*/*upper* must be given."""
        if isinstance(var, CSEVariable):
            var = str(var)
        assert isinstance(var, str), type(var)
        assert lower is None or isinstance(lower, str)
        assert upper is None or isinstance(upper, str)
        if lower and upper:
            # The conditions need to be in parens because of Python's operator precedence.
            # It'd be less error-prone to use and/or/not, which is supported by triton
            cond = f"({lower} <= {var}) & ({var} < {upper})"
            cond_print = f"{lower} <= {var} < {upper}"
        elif lower:
            cond = f"{lower} <= {var}"
            cond_print = cond
        else:
            assert upper
            cond = f"{var} < {upper}"
            cond_print = cond

        if mask:
            cond = f"({cond}) | ~({mask})"

        return f'{self.assert_function}({cond}, "index out of bounds: {cond_print}")'

    def check_bounds(
        self, expr: sympy.Expr, size: sympy.Expr, lower: bool, upper: bool
    ) -> None:
        raise NotImplementedError

    def index_to_str(self, index: sympy.Expr) -> str:
        raise NotImplementedError

    def __enter__(self) -> Self:
        # Install this kernel and its CSE-wrapping ops handler for the
        # duration of the context; both are popped by CodeGen.__exit__.
        super().__enter__()
        assert self.overrides
        self.exit_stack.enter_context(
            V.set_ops_handler(CSEProxy(self, self.overrides()))
        )
        self.exit_stack.enter_context(V.set_kernel_handler(self))
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        self.remove_kernel_local_buffers()
        super().__exit__(exc_type, exc_val, exc_tb)

    def remove_kernel_local_buffers(self) -> None:
        """
        Any buffers that are both created and have a last use in the
        same kernel can be removed.

        Note that V.graph.scheduler can be None when codegening triton template
        kernels.
        """
        scheduler = V.graph.scheduler
        if not scheduler:
            return
        fused_node_names = OrderedSet(
            scheduler.name_to_buf[buf].defining_op_name()
            for buf in self.store_buffer_names
            if buf in scheduler.name_to_buf
        )
        names_to_remove: OrderedSet[str] = OrderedSet()
        for name in self.store_buffer_names:
            if (
                name not in self.must_keep_buffers
                and name not in self.args.input_buffers
                and scheduler.can_buffer_be_removed_through_fusion(
                    name, fused_node_names
                )
            ):
                names_to_remove.add(name)

        for name in names_to_remove:
            if name in self.args.inplace_buffers:
                buf = self.args.inplace_buffers[name]
                if isinstance(buf, RemovedArg):
                    continue
                # Only drop an inplace buffer if every aliased name is removable.
                remove = all(n in names_to_remove for n in buf.other_names)
                if remove:
                    self.remove_inplace_buffer(name)
                    self.inplaced_to_remove.add(name)
            else:
                self.remove_buffer(name)

    def remove_buffer(self, name: str) -> None:
        # Assign a special value instead of deleting the entry
        # because we still rely on output_buffers's length to
        # generate unique arg name.
        log.debug("remove_buffer(%r)", name)
        self.args.output_buffers[name] = REMOVED
        self.removed_buffers.add(name)

    def remove_inplace_buffer(self, name: str) -> None:
        log.debug("removing_inplace_buffer(%r)", name)
        self.args.inplace_buffers[name] = REMOVED
        self.removed_buffers.add(name)

    def rename_indexing(
        self, index: Union[list[sympy.Expr], tuple[sympy.Expr, ...], sympy.Expr]
    ) -> sympy.Expr:
        # adds the necessary kernel args for index expressions
        # and renames variables in index expressions to kernel arg names
        if isinstance(index, (list, tuple)):
            return [self.rename_indexing(x) for x in index]
        index = V.graph.sizevars.simplify(index)
        # Sort for deterministic arg registration order.
        sorted_symbols = sorted(index.free_symbols, key=lambda s: s.name)
        replacements = {
            x: self.args.size(x)
            for x in sorted_symbols
            if symbol_is_type(
                x,
                (
                    SymT.UNBACKED_INT,
                    SymT.SIZE,
                    SymT.PRECOMPUTED_SIZE,
                ),
            )
        }
        return sympy_subs(index, replacements)

    def create_cse_var(self, *args: Any, **kwargs: Any) -> CSEVariable:
        # Backend hook: subclasses return their own CSEVariable subclass.
        return CSEVariable(*args, **kwargs)

    def arg_name(self, node: IRNode) -> Optional[str]:
        """
        Returns arg name of a given input or output node.
        """
        if node is None:
            return None
        return self.args.arg_name(node.get_name())
2238
+
2239
+
2240
@dataclasses.dataclass
class OptimizationContext:
    """Per-FX-node annotation stored in ``node.meta`` (under ``key``),
    carrying dtype information consumed during codegen (see the cpp
    ``masked`` handling in ``CSEProxy._default``)."""

    # The node.meta key under which this context is stored.
    key: ClassVar[str] = "opt_ctx"

    # Inferred dtype of the op's output, when known.
    dtype: Optional[torch.dtype] = None
    ops_name: str = ""
2246
+
2247
+
2248
@functools.cache
def jinja2_env() -> Any:
    """Return a process-wide Jinja2 Environment (StrictUndefined), or None
    when jinja2 is not installed. Cached, so all callers share one instance."""
    try:
        import jinja2
    except ImportError:
        return None

    return jinja2.Environment(
        undefined=jinja2.StrictUndefined,
    )
2258
+
2259
+
2260
class KernelTemplate:
    """
    Base class for defining kernel templates.

    Children classes: TritonTemplate, CUDATemplate
    """

    @staticmethod
    def indent_except_first(
        source: str, num_indents: int, indents_spacing: int = 4
    ) -> str:
        """Indent every line of *source* except the first by
        ``num_indents * indents_spacing`` spaces (Jinja filter helper)."""
        lines = source.splitlines(True)
        if len(lines) > 1:
            lines[1:] = [
                (" " * indents_spacing * num_indents) + line for line in lines[1:]
            ]
        return "".join(lines)

    @staticmethod
    def _template_from_string(source: str) -> Any:
        """Compile *source* into a Jinja2 template (None if jinja2 missing).

        On a syntax error, re-raises as a subclass whose str() includes the
        offending line with surrounding context and a column marker.
        """
        env = jinja2_env()
        if env is None:
            return None
        env.filters["indent_except_first"] = KernelTemplate.indent_except_first
        from jinja2 import TemplateSyntaxError

        try:
            return env.from_string(source)
        except TemplateSyntaxError as e:

            class DetailedTemplateSyntaxError(TemplateSyntaxError):
                def __init__(self, original_error: TemplateSyntaxError) -> None:
                    super().__init__(
                        original_error.message,
                        original_error.lineno,
                        original_error.name,
                        original_error.filename,
                    )
                    self.original_error = original_error

                def __str__(self) -> str:
                    error_info = f"Error in template at line {self.lineno}\n"
                    error_info += f"Error message: {self.message}\n"
                    if hasattr(self.original_error, "source"):
                        lines = self.original_error.source.split("\n")
                        error_info += "Context:\n"
                        # Show up to two lines before and after the error line.
                        start = max(0, self.lineno - 2)
                        end = min(len(lines), self.lineno + 2)
                        for i in range(start, end):
                            if i == self.lineno - 1:
                                error_info += f"{i + 1}: --> {lines[i]}\n"
                                if hasattr(self.original_error, "column"):
                                    error_info += (
                                        "         "
                                        + " " * (self.original_error.column - 1)
                                        + "^\n"
                                    )
                            else:
                                error_info += f"{i + 1}: {lines[i]}\n"
                    return error_info

            raise DetailedTemplateSyntaxError(e) from e

    @staticmethod
    def _fake_get_dtype(
        fake_outs: Union[list[Buffer], Buffer],
    ) -> Callable[[str], torch.dtype]:
        """Return a get_dtype fn that resolves the given buffer(s) locally
        and falls back to V.graph.get_dtype for everything else."""
        _get_dtype_real = V.graph.get_dtype
        if isinstance(fake_outs, (list, tuple)):
            lookup = {buf.get_name(): buf.get_dtype() for buf in fake_outs}
        else:
            lookup = {fake_outs.get_name(): fake_outs.get_dtype()}

        def get_dtype(name: str) -> torch.dtype:
            result = lookup.get(name)
            if result is not None:
                return result
            return _get_dtype_real(name)

        return get_dtype

    def __init__(self, name: str) -> None:
        self.name = name

    def maybe_append_choice(
        self, choices: list[Any], **kwargs: Any
    ) -> Optional[NotImplementedError]:
        """
        Maybe generates a new ChoiceCaller and appends it into existing choices.
        Returns None if success, otherwise returns the error.

        choices: A list of ChoiceCallers.
        kwargs: Additional kwargs to be passed to self.generate() to generate a new ChoiceCaller.
        """

        try:
            choices.append(self.generate(**kwargs))
            return None
        except NotImplementedError as e:
            log.info(
                "Cannot Append Choice: %s. KernelTemplate type is %s",
                e,
                type(self),
                stack_info=log.getEffectiveLevel() < logging.INFO,
            )
            return e

    def generate(self, **kwargs: Any) -> ChoiceCaller:
        """
        Generates a ChoiceCaller instance from the given arguments.
        """

        raise NotImplementedError
2373
+
2374
+
2375
class CSEProxy(DefaultHandler):
    """Ops handler that wraps a backend handler, routing every op's output
    through the kernel's CSE cache and propagating value-range bounds and
    dtypes onto the resulting variables."""

    name = "CSEProxy"

    def __init__(self, kernel: Kernel[Any], parent_handler: OpsHandler[Any]):
        super().__init__()
        from ..bounds import ValueRangeAnalysis

        self.vr_analysis = ValueRangeAnalysis()
        self.kernel = kernel
        self.parent_handler = parent_handler

    def _default(self, name: str, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
        """Fallback for every op without an explicit override: compute bounds,
        delegate to the parent handler, infer the output dtype per backend,
        then CSE each output value."""
        bounds = self._bound_variable(name, *args, **kwargs)

        value = getattr(self.parent_handler, name)(*args, **kwargs)
        dtype_handler = DtypePropagationOpsHandler()

        backend = get_current_backend()

        output_dtype = None
        if name == "masked" and backend == "triton":
            output_dtype = value.dtype
        elif name == "masked" and backend == "cpp":
            output_dtype = V.interpreter.current_node.meta.get(
                OptimizationContext.key, None
            ).dtype
        elif backend in ("triton", "cpp", "mps"):
            dtype_op = getattr(dtype_handler, name)
            output_dtype = dtype_op(*args, **kwargs)

        if backend in ("triton", "cpp"):
            # maybe there are some exceptions on mps?
            assert output_dtype is not None

        output_idx = 0

        def do_cse(v: str) -> CSEVariable:
            # we tree_map over the output, so we need to fetch corresponding dtype
            nonlocal output_idx
            var_dtype: Optional[torch.dtype] = (
                output_dtype[output_idx]
                if isinstance(output_dtype, (list, tuple))
                else output_dtype
            )
            output_idx += 1

            # some cpp op implementations don't set the dtype
            if backend == "cpp" and isinstance(v, CSEVariable) and v.dtype is None:
                v.dtype = var_dtype

            csevar = V.kernel.cse.generate(
                V.kernel.compute,
                v,
                bounds=bounds,
                dtype=output_dtype,
            )

            csevar.update_on_args(name, args, kwargs)

            if (
                config.test_configs.runtime_triton_dtype_assert
                or config.test_configs.static_cpp_dtype_assert
            ):
                assert var_dtype is not None
                check_dtype(V.kernel.compute, csevar, var_dtype)
            return csevar

        return pytree.tree_map(do_cse, value)

    def _bound_variable(self, name: str, *args: Any, **kwargs: Any) -> ValueRanges[Any]:
        """
        If the variable comes from an FX node, we forward the bound we have already computed
        Else, if the variable when codegen'ing another op, we try to compute its bounds
        """
        from ..bounds import ValueRangeAnalysis
        from ..select_algorithm import TritonTemplateKernel
        from .cuda.cuda_kernel import CUDATemplateKernel

        if isinstance(V.kernel, TritonTemplateKernel):
            return ValueRanges.unknown()

        if isinstance(V.kernel, CUDATemplateKernel):
            return ValueRanges.unknown()

        fx_node = V.interpreter.current_node
        if fx_node.target == name and self.kernel.node_to_bounds is not None:
            assert isinstance(self.kernel.node_to_bounds, dict), type(
                self.kernel.node_to_bounds
            )
            return self.kernel.node_to_bounds.get(fx_node, ValueRanges.unknown())
        elif config.compute_all_bounds and hasattr(ValueRangeAnalysis, name):
            # These create lots of inner strings. We would need to compute the bounds at the ops
            # We will also likely not get much from computing VRs on these nodes
            if any(s in fx_node.target for s in ("set_indirect", "reduction", "scan")):
                return ValueRanges.unknown()

            # We assume that the inputs come from `ops.` and are not strings. If you want to generate
            # intermediary strings, wrap them in CSE variables with properly initialised bounds.

            # If there is no FX bound but we know how to compute one we do so
            assert not kwargs

            def arg_to_bound(x: Any) -> Any:
                if isinstance(x, CSEVariable):
                    return x.bounds
                elif isinstance(x, sympy.Expr):
                    return bound_sympy(x)
                else:
                    return x

            arg_bounds = list(map(arg_to_bound, args))
            return getattr(self.vr_analysis, name)(*arg_bounds)
        return ValueRanges.unknown()

    def indirect_indexing(
        self,
        var: CSEVariable,
        size: Union[sympy.Expr, int],
        check: bool = True,
        wrap_neg: bool = True,
    ) -> sympy.Symbol:
        """Convert a runtime value into an index symbol, optionally wrapping
        negative indices (Python-style) and emitting bounds asserts."""
        if isinstance(size, int):
            size = sympy.Integer(size)
        assert isinstance(size, sympy.Expr), (type(size), size)
        # Skip CSE since this doesn't return an expression

        if var.bounds.lower < 0:
            if wrap_neg:
                stm = ops.add(var, ops.index_expr(size, torch.long))
                # Mixed negative and non-negative
                if var.bounds.upper >= 0:
                    lt = ops.lt(var, 0)
                    stm = ops.where(lt, stm, var)
            else:
                stm = var

            # Propagate bounds as we know how to compute them properly
            new_bounds = ValueRanges.unknown()
            if var.bounds != ValueRanges.unknown() and isinstance(size, sympy.Number):
                # Take the negative part of the bound and add size to it
                # Then take union of that and the positive part
                # This is a tighter bound than that of a generic ops.where, as we have info on the cond
                neg_bounds = var.bounds & ValueRanges(-int_oo, -1)
                new_bounds = ValueRanges(
                    neg_bounds.lower + size, neg_bounds.upper + size
                )
                # We don't have a good way of representing the empty range
                if var.bounds.upper >= 0:
                    pos = var.bounds & ValueRanges(0, int_oo)
                    new_bounds = new_bounds | pos

            var = self.kernel.cse.generate(self.kernel.compute, stm, bounds=new_bounds)

        sympy_var = self.parent_handler.indirect_indexing(var, size, check)
        if generate_assert(check):
            assert_lower = not (var.bounds.lower >= 0)
            # value ranges cannot x < s when x and s are symbols
            assert_upper = not isinstance(size, sympy.Number) or not (
                var.bounds.upper < size
            )
            self.kernel.check_bounds(sympy_var, size, assert_lower, assert_upper)
        return sympy_var

    def check_bounds(
        self, expr: sympy.Expr, size: sympy.Expr, lower: bool, upper: bool
    ) -> None:
        return self.kernel.check_bounds(expr, size, lower, upper)

    def load(self, name: str, index: sympy.Expr) -> CSEVariable:
        """Load buffer *name* at *index*, short-circuiting through the
        store cache when the value was written earlier in this kernel."""
        if name in self.kernel.cse.invalidated_stores:
            # A load from an invalidated store requires us to
            # keep the actual buffer around
            V.kernel.must_keep_buffers.add(name)
        if free_symbol_is_type(index, SymT.TMP):
            return self.kernel.indirect_load(name, index)
        store_cache = self.kernel.cse.store_cache
        if name in store_cache:
            return store_cache[name]
        out = self.kernel.load(name, index)
        # count load that is not in the store_cache, and also not in the
        # cse cache.
        if out.use_count == 1:
            self.kernel.num_load += 1
        return out

    def _update_store_cache(self, name: str, value: CSEVariable) -> None:
        # Record the stored value for *name* and for every buffer it mutates,
        # so subsequent loads can be served from the cache.
        self.kernel.cse.store_cache[name] = value
        if self.kernel.current_node and name in V.graph.name_to_buffer:
            buf = self.kernel.current_node.get_output(name)
            for other_name in buf.get_mutations():
                self.kernel.cse.store_cache[other_name] = value

    def store(
        self, name: str, index: sympy.Expr, value: CSEVariable, mode: StoreMode = None
    ) -> None:
        self.kernel.store_buffer_names.add(name)
        if mode is None:
            self._update_store_cache(name, value)
        if name not in V.graph.removed_buffers:
            self.kernel.store(name, index, value, mode=mode)

    def store_reduction(self, name: str, index: sympy.Expr, value: CSEVariable) -> None:
        self.kernel.store_buffer_names.add(name)
        self._update_store_cache(name, value)

        if name not in V.graph.removed_buffers:
            return self.kernel.store_reduction(name, index, value)

    def reduction(
        self,
        dtype: torch.dtype,
        src_dtype: torch.dtype,
        reduction_type: ReductionType,
        value: Union[CSEVariable, tuple[CSEVariable, ...]],
    ) -> Union[CSEVariable, tuple[CSEVariable, ...]]:
        self.kernel.num_reduction += 1
        return self.kernel.reduction(dtype, src_dtype, reduction_type, value)

    def scan(
        self,
        dtypes: tuple[torch.dtype, ...],
        combine_fn: Callable[
            [tuple[CSEVariable, ...], tuple[CSEVariable, ...]],
            tuple[CSEVariable, ...],
        ],
        values: tuple[CSEVariable, ...],
    ) -> tuple[CSEVariable, ...]:
        return self.kernel.scan(dtypes, combine_fn, values)

    def sort(
        self,
        dtypes: tuple[torch.dtype, ...],
        values: tuple[CSEVariable, ...],
        stable: bool,
        descending: bool,
    ) -> tuple[CSEVariable, ...]:
        return self.kernel.sort(dtypes, values, stable, descending)

    def bucketize(
        self,
        values: CSEVariable,
        boundaries: tuple[str, sympy.Expr, sympy.Expr, sympy.Expr],
        boundary_indices: CSEVariable,
        indexing_dtype: torch.dtype,
        right: bool,
        sorter: Optional[tuple[str, sympy.Expr]] = None,
        sorter_indices: Optional[CSEVariable] = None,
    ) -> CSEVariable:
        """
        [Note: Inductor bucketize op]

        Inputs:
        -------
        values: the values to be bucketized.
        boundaries: a tuple containing
          (a) the name of the boundaries tensor (which must be sorted, unless
          the sorting tensor is present),
          (b) the length of the tensor in the last dimension (i.e. the length of
          one set of boundaries),
          (c) the number of elements in the underlying storage (i.e. the length
          of the flattened tensor, ignoring striding), and
          (d) the stride of the tensor in the last dimension.
        boundary_indices: indices into a flattened version of the boundaries
        tensor, of the same size and shape as "values". Each index points to
        the first element in the set of boundaries to be used for the
        corresponding value.
        indexing_dtype: the dtype to use when indexing into the boundaries
        tensor. This must be int64 or int32. This additionally specifies the
        dtype of the return value.
        right: see "Details" below.
        sorter: an optional tuple containing
          (a) the name of an optional sorting tensor, used to access unsorted
          boundaries without reordering the boundaries tensor, and
          (b) the stride of the tensor in the last dimension.
        The values in the sorting tensor are used as indices into the *last*
        dimension of the boundaries tensor, with all other indices matching.
        The size of the sorting and boundaries tensors must be equivalent.
        sorter_indices: must be present if the sorting array is present; see
        "boundary_indices" for the equivalent definition for the boundaries
        tensor.

        Output:
        -------
        The buckets each value belongs in, within a given set of boundaries. 0
        indicates a position before the first boundary, and len(boundaries_set)
        represents a position after the last boundary.

        Details:
        --------
        Given a value and a set of boundaries, calculate the bucket that each
        value belongs to. This works differently in 1-D and N-D cases.

        for values [[-1, 0, 1, 2], [3, 4, 5, 9]], boundaries [0, 4, 4, 8], right=True
        return = [[ 0, 1, 1, 1], [1, 3, 3, 4]].

        for values [[-1, 0, 1, 2], [3, 4, 5, 9]], boundaries [[0, 4], [4, 8]], right=True
        return = [[ 0, 1, 1, 1], [0, 1, 1, 2]]

        Note that in the N-D boundaries case, the shape of "values" and
        "boundaries" must match in every dimension _except_ the last.

        When right == False, bucket i refers to range (boundaries[i], boundaries[i+1]].
        When right == True, bucket i refers to range [boundaries[i], boundaries[i+1]).

        Boundaries must be non-decreasing, or a sorter must be provided which
        would re-index offsets in a non-decreasing order (e.g. the second output
        of torch.sort(offsets)). Otherwise, the result is undefined.
        """
        return self.kernel.bucketize(
            values,
            boundaries,
            boundary_indices,
            indexing_dtype,
            right,
            sorter,
            sorter_indices,
        )
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp.py ADDED
The diff for this file is too large to render. See raw diff
 
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_bmm_template.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import contextlib
3
+ import itertools
4
+ from typing import Any, Callable, Optional
5
+ from unittest.mock import patch
6
+
7
+ import sympy
8
+
9
+ from .. import ir
10
+ from ..select_algorithm import PartialRender
11
+ from ..virtualized import V
12
+ from .common import ArgName
13
+ from .cpp_gemm_template import CppGemmTemplate, GEMM_TEMPLATE
14
+ from .cpp_micro_gemm import LayoutType
15
+ from .cpp_template_kernel import CppTemplateKernel
16
+ from .cpp_utils import DTYPE_TO_CPP, GemmBlocking
17
+
18
+
19
# Jinja stubs that emit the *definitions* of the two per-batch GEMM kernels
# (a single-threaded and a multi-threaded variant) invoked by BMM_TEMPLATE.
# Each stub only renders the function signature via kernel.def_kernel; the
# body is appended from GEMM_TEMPLATE by the codegen_* methods on the class.
# `b_index` is passed as an extra sizevar so the 2D GEMM body can slice the
# 3D batch tensors; the "<...>" placeholder strings are resolved later by
# PartialRender hooks.
# We pass all sizevars present in BY to the GEMM templates so variables are not renamed in the BMM definition
GEMM_SINGLE_THREAD_MM_STUB = r"""
{{kernel.def_kernel(
    inputs={"X": X, "W": W},
    outputs={"Y": Y_2d},
    aliases=aliases,
    function_name=kernel_name+"_single_thread_mm",
    extra_sizevars=BY_sizevars + [b_index],
    placeholder="<SINGLE_THREAD_MM_DEF_FOR_BMM>")}}"""

# Same as GEMM_SINGLE_THREAD_MM_STUB but for the multi-threaded GEMM variant.
GEMM_THREADED_MM_STUB = r"""
{{kernel.def_kernel(
    inputs={"X": X, "W": W},
    outputs={"Y": Y_2d},
    aliases=aliases,
    function_name=kernel_name+"_threaded_mm",
    extra_sizevars=BY_sizevars + [b_index],
    placeholder="<THREADED_MM_DEF_FOR_BMM>")}}"""

# Top-level BMM kernel: emits the micro-kernel and both GEMM variants, then a
# C entry point that loops over the batch dimension B.  The first
# (B / num_threads) * num_threads batches are distributed across threads via
# OpenMP, each calling the single-threaded GEMM; the remaining tail batches
# each call the internally-threaded GEMM so all cores stay busy.
BMM_TEMPLATE = r"""
{{ template.codegen_microkernel_def() }}
{{ template.codegen_single_thread_gemm() }}
{{ template.codegen_multi_thread_gemm() }}

extern "C"
{{kernel.def_kernel(inputs={"X": BX, "W": BW}, outputs={"Y": BY}, aliases=aliases)}}
{
    const int64_t B = {{kernel.size(BY_2d, 0)}};
    {%- if num_threads > 1 %}
    constexpr int64_t num_threads = {{num_threads}};
    int64_t B_single_thread_block = (B / num_threads) * num_threads;

    #pragma omp parallel for num_threads({{num_threads}})
    {%- else %}
    int64_t B_single_thread_block = B;
    {%- endif %}
    for (int64_t b_start = 0; b_start < B_single_thread_block; ++b_start) {
        {{template.get_gemm_function_call(
            kernel,
            kernel_name+"_single_thread_mm",
            "<SINGLE_THREAD_CALL_FOR_BMM>",
            b_index="b_start",
        )}}
    }
    for (int64_t b_start = B_single_thread_block; b_start < B; ++b_start) {
        {{template.get_gemm_function_call(
            kernel,
            kernel_name+"_threaded_mm",
            "<THREADED_MM_CALL_FOR_BMM>",
            b_index="b_start",
        )}}
    }
}
"""
73
+
74
+
75
class CppBmmTemplate(CppGemmTemplate):
    """
    C++ template for batched matmul (BMM), built on top of the 2D GEMM template.

    The batch dimension is handled by an extra sizevar ``b_index`` (a sympy
    symbol) that the GEMM template uses to slice the 3D inputs, so the GEMM
    body itself is reused unchanged.  See ``__init__`` for the threading
    strategy.
    """

    def __init__(
        self,
        input_nodes,
        layout: ir.Layout,
        num_threads: int,
        register_blocking: GemmBlocking,
        beta=1,
        alpha=1,
        has_bias=False,
        epilogue_creator: Optional[Callable[[ir.Buffer], ir.Pointwise]] = None,
        should_block_weights: bool = False,
        name="bmm",
    ):
        """
        In order to simplify the implementation and increase code reuse, the BMM template implements
        two versions of the GEMM kernel: a single-threaded version and a multi-threaded version.
        GEMM kernels are called in a loop over the batch dimension, with single-threaded GEMM calls
        for all but the last (B % num_threads), which are handled by the multi-threaded GEMM kernel.

        We use an extra sizevar `b_index` to index the batch dimension, which we pass into the GEMM
        template as a sympy.Symbol. This allows us to slice the 3D batch tensors in the GEMM template
        without any changes to the GEMM template itself.
        """
        super().__init__(
            input_nodes,
            layout,
            num_threads,
            register_blocking,
            beta=beta,
            alpha=alpha,
            has_bias=has_bias,
            epilogue_creator=epilogue_creator,
            should_block_weights=should_block_weights,
            name=name,
        )
        # Symbol that indexes the batch dimension when slicing the 3D tensors.
        self.b_index = sympy.Symbol("s_b_index", integer=True, nonnegative=True)

    @staticmethod
    def get_padded_size(n, block_n, k, should_block_weight):
        """
        Return ``(new_size, padded_n)`` for the (possibly blocked) weight.

        When blocking is required, delegate to the 2D GEMM sizing and prepend a
        wildcard batch dimension; otherwise keep the plain ``[B, k, n]`` shape.
        """
        if should_block_weight:
            # Tensor is constant or not contiguous, so we will pad and block
            new_size, padded_n = CppGemmTemplate.get_padded_size(
                n, block_n, k, should_block_weight
            )
            # Add the new batch dimension
            new_size.insert(0, -1)
            return new_size, padded_n
        else:
            new_size = [-1, k, n]
            return new_size, n

    @staticmethod
    def check_if_block_weight(W, micro_gemm):
        """
        Decide whether the weight tensor ``W`` must be repacked/blocked.

        Blocking is needed when W is non-contiguous, is a graph constant, or
        when N is not a multiple of the micro-kernel's block_n while the
        micro-kernel expects a non-NORMAL (e.g. VNNI) B layout.
        """
        assert isinstance(W, ir.IRNode)
        _, n = W.get_size()[-2:]
        return (
            not W.get_layout().is_contiguous()
            or W.get_name() in V.graph.constants
            or (
                n % micro_gemm.register_blocking.block_n != 0
                # Fix: get_b_layout is a method and must be *called*.  The
                # previous code compared the bound-method object itself to
                # LayoutType.NORMAL, which is always unequal, so this clause
                # degenerated to just the block_n divisibility check.
                and micro_gemm.get_b_layout() != LayoutType.NORMAL
            )
        )

    def get_gemm_function_call(
        self,
        kernel: CppTemplateKernel,
        function_name: str,
        placeholder: str,
        b_index: str,
    ) -> str:
        """
        Similar to 'def_kernel' in cpp_template_kernel, but instead of generating a function definition,
        generate a function call for the GEMM kernel.
        Args:
            placeholder: The string to replace the function call with
            b_index: The index for slicing the 3D batch tensors
        """

        def hook():
            # Rebuild the arg list at render time; substitute the symbolic
            # batch index with the loop variable name (e.g. "b_start").
            arg_defs, call_args, _, _ = kernel.args.python_argdefs()
            for i, buf in enumerate(call_args):
                if buf == self.b_index:
                    arg_defs[i] = ArgName(b_index)
            return f"{function_name}({', '.join(x.full_name() for x in arg_defs)});"

        assert placeholder not in kernel.render_hooks
        kernel.render_hooks[placeholder] = hook
        return placeholder

    def get_default_reindexers(self, epilogue_nodes):
        def reindexer(args):
            # Epilogue nodes have 3D ranges but args are 2D, so prepend the
            # batch index symbol.
            return [self.b_index] + args

        return [reindexer] * len(epilogue_nodes)

    def get_options(
        self,
        kernel: CppTemplateKernel,
        template_buffer_node: Optional[ir.CppTemplateBuffer] = None,
        flag_template_buffer_has_other_users: Optional[bool] = None,
        epilogue_nodes: Optional[list[ir.IRNode]] = None,
        **kwargs,
    ) -> dict[str, Any]:
        """
        Extend the GEMM render options with BMM-specific entries: keep the
        original 3D buffers under B-prefixed keys and replace X/W/Y_2d/GemmOut
        with 2D slices taken at ``self.b_index``.
        """
        options = super().get_options(
            kernel=kernel,
            template_buffer_node=template_buffer_node,
            flag_template_buffer_has_other_users=flag_template_buffer_has_other_users,
            epilogue_nodes=epilogue_nodes,
            **kwargs,
        )

        BX, BW, BY = options["X"], options["W"], options["Y"]
        options["BX"], options["BW"], options["BY"] = BX, BW, BY
        options["BY_2d"] = options["Y_2d"]
        for kword in ["X", "W", "GemmOut", "Y_2d"]:
            options[kword] = kernel.select(options[kword], 0, self.b_index)
        for kword in ["X", "W", "Y_2d"]:
            options[kword + "_dtype"] = DTYPE_TO_CPP[options[kword].dtype]
        options["b_index"] = self.b_index
        # Free symbols of BY's sizes/strides are forwarded so the GEMM stubs
        # do not rename sizevars inside the BMM definition.
        options["BY_sizevars"] = [
            s
            for sym in itertools.chain(BY.get_size(), BY.get_stride())
            if isinstance(sym, sympy.Expr)
            for s in sym.free_symbols
        ]
        options["kernel_name"] = kernel.kernel_name

        return options

    def render(  # type: ignore[override, return]
        self,
        kernel: CppTemplateKernel,
        template_buffer_node: Optional[ir.CppTemplateBuffer] = None,
        flag_template_buffer_has_other_users: Optional[bool] = None,
        epilogue_nodes: Optional[list[ir.IRNode]] = None,
        **kwargs,
    ) -> str:
        """
        Render the full BMM kernel source.  The inner GEMM stubs register
        "*_FOR_BMM" placeholder hooks, which are finalized here (and removed
        from the kernel so the outer render does not see them again).
        """
        options = self.get_options(
            kernel=kernel,
            template_buffer_node=template_buffer_node,
            flag_template_buffer_has_other_users=flag_template_buffer_has_other_users,
            epilogue_nodes=epilogue_nodes,
            **kwargs,
        )
        # Saved so codegen_single_thread_gemm/codegen_multi_thread_gemm
        # (invoked from within the template) can reuse the same options.
        self.render_options = options

        with contextlib.ExitStack() as stack:
            for buf in options["fake_buffers"]:
                stack.enter_context(
                    patch.object(V.graph, "get_dtype", self._fake_get_dtype(buf))
                )
            result = self._template_from_string(BMM_TEMPLATE).render(**options)

        # Finalize the function definitions for the gemm routines
        sub_mm_hooks = {
            name: hook
            for name, hook in kernel.render_hooks.items()
            if "FOR_BMM" in name
        }
        result = PartialRender(result, sub_mm_hooks).finalize_all()
        for name in sub_mm_hooks:
            del kernel.render_hooks[name]
        # b_index is only meaningful inside the generated GEMM bodies; drop it
        # so it is not emitted as a kernel argument of the outer BMM kernel.
        del kernel.args.sizevars[options["b_index"]]
        return result

    def codegen_single_thread_gemm(self):
        """Emit the definition of the single-threaded GEMM variant."""
        stub = self._template_from_string(GEMM_SINGLE_THREAD_MM_STUB).render(
            self.render_options
        )
        return stub + self._template_from_string(GEMM_TEMPLATE).render(
            {**self.render_options, "num_threads": 1}
        )

    def codegen_multi_thread_gemm(self):
        """Emit the definition of the internally-threaded GEMM variant."""
        stub = self._template_from_string(GEMM_THREADED_MM_STUB).render(
            self.render_options
        )
        return stub + self._template_from_string(GEMM_TEMPLATE).render(
            self.render_options
        )

    def codegen_gemm_stub_def(self):
        # The GEMM signatures are emitted by the *_MM_STUB templates instead.
        return ""
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_flex_attention_template.py ADDED
@@ -0,0 +1,1081 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import contextlib
3
+ import logging
4
+ import re
5
+ from typing import Optional
6
+ from unittest.mock import patch
7
+
8
+ import sympy
9
+
10
+ import torch
11
+ import torch.utils
12
+
13
+ from ...utils._ordered_set import OrderedSet
14
+ from .. import ir
15
+ from ..ir import TensorBox
16
+ from ..select_algorithm import DataProcessorTemplateWrapper
17
+ from ..utils import parallel_num_threads
18
+ from ..virtualized import V
19
+ from .cpp_template import CppTemplate
20
+ from .cpp_utils import GemmBlocking
21
+
22
+
23
log = logging.getLogger(__name__)

# C++ helper kernels emitted once per flex-attention kernel instance.  The
# {{kernel_name}} prefix keeps the symbols unique per generated kernel.
# TODO: reuse cpp codegen to generate below pointwise/reduction kernels
SOFTMAX_FUSIONS = r"""
// 1) out = exp(a - val)
// 2) val = sum(out)
template <typename T1, typename T2>
inline void {{kernel_name}}_exp_reduce_sum_fusion_kernel(
    T1* a,
    const int& size,
    T2* out,
    T1& val) {
  auto vec_size = at::vec::Vectorized<T1>::size();
  auto vec_max = at::vec::Vectorized<T1>(val);
  T1 tmp_sum = 0;
  auto vec_tmp_sum = at::vec::Vectorized<T1>(tmp_sum);
  for (long i = 0; i < vec_size * (size / vec_size); i += vec_size) {
    auto tmp0 = at::vec::Vectorized<T1>::loadu(a + i);
    auto tmp1 = tmp0 - vec_max;
    auto tmp2 = tmp1.exp_u20();
    vec_tmp_sum += tmp2;
    at::native::_store(out + i, tmp2);
  }
  tmp_sum = at::vec::vec_reduce_all<T1>(
      [](at::vec::Vectorized<T1>& x, at::vec::Vectorized<T1>& y) {
        return x + y;
      },
      vec_tmp_sum);
  for (long i = vec_size * (size / vec_size); i < size; i++) {
    auto tmp0 = a[i];
    auto tmp1 = tmp0 - val;
    auto tmp2 = exp(tmp1);
    tmp_sum += tmp2;
    out[i] = tmp2;
  }
  val = tmp_sum;
}

// 1) out = a * scale
// 2) max = max(out)
template <typename scalar_t>
inline void {{kernel_name}}_mul_reduce_max_fusion_kernel(
    const scalar_t* a,
    const scalar_t& scale,
    const int& size,
    scalar_t* out,
    scalar_t& max) {
  auto vec_size = at::vec::Vectorized<scalar_t>::size();
  auto vec_scale = at::vec::Vectorized<scalar_t>(scale);
  scalar_t tmp_max = -std::numeric_limits<scalar_t>::infinity();
  auto vec_tmp_max = at::vec::Vectorized<scalar_t>(tmp_max);
  for (long i = 0; i < vec_size * (size / vec_size); i += vec_size) {
    auto tmp0 = at::vec::Vectorized<scalar_t>::loadu(a + i);
    auto tmp1 = tmp0 * vec_scale;
    vec_tmp_max = at::vec::maximum(vec_tmp_max, tmp1);
    at::native::_store(out + i, tmp1);
  }
  for (long i = vec_size * (size / vec_size); i < size; i++) {
    auto tmp0 = a[i];
    auto tmp1 = tmp0 * scale;
    tmp_max = std::max(tmp_max, tmp1);
    out[i] = tmp1;
  }
  max = std::max(
      tmp_max,
      at::vec::vec_reduce_all<scalar_t>(
          [](at::vec::Vectorized<scalar_t>& x, at::vec::Vectorized<scalar_t>& y) {
            return at::vec::maximum(x, y);
          },
          vec_tmp_max));
}

template <typename scalar_t>
static inline scalar_t* {{kernel_name}}_conditional_data_ptr(scalar_t* ptr, scalar_t* ptr2) {
  TORCH_CHECK(ptr2 == nullptr);
  return ptr;
}

template <typename scalar_t,
          typename std::enable_if_t<c10::is_reduced_floating_point_v<scalar_t>, int> = 0>
static inline scalar_t* {{kernel_name}}_conditional_data_ptr(float* ptr, scalar_t* ptr2) {
  return ptr2;
}

template <typename scalar_t>
inline void {{kernel_name}}_fill_stub(scalar_t* data, scalar_t val, int64_t size) {
  using Vec = at::vec::Vectorized<scalar_t>;
  Vec data_vec = Vec(val);
  int64_t d = 0;
  for (; d < size - (size % Vec::size()); d += Vec::size()) {
    data_vec.store(data + d);
  }
  #if !defined(_MSC_VER) && !defined(COMPILING_FOR_MIN_SIZE)
  # pragma unroll
  #endif
  for (; d < size; d++) {
    data[d] = val;
  }
}

// out = a * scale
template <typename scalar_t>
inline void {{kernel_name}}_mul_scale_kernel(
    scalar_t* a,
    scalar_t scale,
    int64_t size) {
  auto vec_size = at::vec::Vectorized<scalar_t>::size();
  auto vec_scale = at::vec::Vectorized<scalar_t>(scale);
  for (int64_t i = 0; i < vec_size * (size / vec_size); i += vec_size) {
    auto tmp0 = at::vec::Vectorized<scalar_t>::loadu(a + i);
    auto tmp1 = tmp0 * vec_scale;
    at::native::_store(a + i, tmp1);
  }
  for (int64_t i = vec_size * (size / vec_size); i < size; i++) {
    auto tmp0 = a[i];
    auto tmp1 = tmp0 * scale;
    a[i] = tmp1;
  }
}

"""

# C++ helper that copies a [rows, cols] tile into a zero-padded
# [prows, pcols] destination (used when packing K/V for brgemm).
BRGEMM_PACK_FUNCTIONS = r"""
template <typename scalar_t>
inline void {{kernel_name}}_copy_value_with_pad(
    const scalar_t* value_ptr,
    scalar_t* dst_ptr,
    int64_t rows,
    int64_t cols,
    int64_t prows,
    int64_t pcols,
    int64_t ldi) {
  auto vec_size = at::vec::Vectorized<scalar_t>::size();
  int64_t i = 0;
  for (; i < rows; i++) {
    int64_t j = 0;
    for (; j < cols - (cols % vec_size); j += vec_size) {
      auto vec_v =
          at::vec::Vectorized<scalar_t>::loadu(value_ptr + i * ldi + j);
      vec_v.store(dst_ptr + i * pcols + j);
    }

    if (j < cols) {
      auto vec_v = at::vec::Vectorized<scalar_t>::loadu(
          value_ptr + i * ldi + j, cols - j);
      vec_v.store(dst_ptr + i * pcols + j, cols - j);
    }

    // col padding
    auto psize = pcols - cols;
    if (psize > 0) {
      auto zero_vec = at::vec::Vectorized<scalar_t>(0);
      int64_t pj = 0;
      for (; pj < psize - (psize % vec_size); pj += vec_size) {
        zero_vec.store(dst_ptr + i * pcols + cols + pj);
      }
      if (pj < psize) {
        zero_vec.store(dst_ptr + i * pcols + cols + pj, psize - pj);
      }
    }
  }
  // row padding
  for (; i < prows; i++) {
    auto zero_vec = at::vec::Vectorized<scalar_t>(0);
    int64_t j = 0;
    for (; j < pcols - (pcols % vec_size); j += vec_size) {
      zero_vec.store(dst_ptr + i * pcols + j);
    }
    if (j < pcols) {
      zero_vec.store(dst_ptr + i * pcols + j, pcols - j);
    }

  }
}
"""

# Placeholder token replaced with the generated micro-GEMM definition.
MICRO_GEMM_TEMPLATE = r"""
GEMM_DEFINE
"""

# Snippet that allocates a named scratch buffer of {{buffer_size}} elements
# via the CPU allocator; item size is 2 bytes for reduced-precision floats
# (bf16/fp16), else 4.
ALLOCATE_BUFFER = r"""
  int64_t {{buffer_name}}_dtype_itemsize = c10::is_reduced_floating_point_v<{{buffer_dtype}}> ? 2 : 4;
  auto& {{buffer_name}}_allocator = *at::getCPUAllocator();
  auto {{buffer_name}}_work_data = {{buffer_name}}_allocator.allocate({{buffer_size}}*{{buffer_name}}_dtype_itemsize);
  void* {{buffer_name}}_data_ptr = {{buffer_name}}_work_data.get();
  {{buffer_dtype}}* {{buffer_name}} = ({{buffer_dtype}}*){{buffer_name}}_data_ptr;
"""
210
+
211
+ FLEX_ATTENTION_TEMPLATE = r"""
212
+ {{template.header().getvalue()}}
213
+ #include <ATen/native/cpu/utils.h>
214
+ #include <ATen/native/CPUBlas.h>
215
+ #include <ATen/Context.h>
216
+ {{template.codegen_micro_gemm(kernel.kernel_name)}}
217
+ {{template.codegen_softmax_fusion(kernel.kernel_name)}}
218
+ {{template.codegen_brgemm_pack_function(kernel.kernel_name)}}
219
+ {%- set kernel_args = {"query": query, "key": key, "value": value,
220
+ "kv_num_blocks": kv_num_blocks, "kv_indices": kv_indices,
221
+ "full_kv_num_blocks": full_kv_num_blocks, "full_kv_indices": full_kv_indices } %}
222
+ {%- set kernel_args = template.update_kernel_args(kernel_args) %}
223
+
224
+ extern "C"
225
+ {{kernel.def_kernel(inputs=kernel_args, outputs={"output": output}, extra_sizevars=template.extra_sizevars)}}
226
+ {
227
+ {{ kernel.maybe_codegen_profile() }}
228
+ int64_t qBlockSize = {{qBlockSize}};
229
+ int64_t kvBlockSize = {{kvBlockSize}};
230
+ int64_t num_thread = {{num_thread}};
231
+
232
+ // dtypes of kernel and internal buffers
233
+ using scalar_t = {{kernel.dtype(query)}};
234
+ constexpr bool is_reduced_type = c10::is_reduced_floating_point_v<scalar_t>;
235
+ using accum_t = at::opmath_type<{{kernel.dtype(query)}}>;
236
+ using Vec = at::vec::Vectorized<accum_t>;
237
+ accum_t scaling_factor = {{scale}};
238
+ int64_t batchSize = {{kernel.size(query, 0)}};
239
+ int64_t qSize = {{kernel.size(query, 1)}};
240
+ int64_t num_head = {{kernel.size(query, 2)}};
241
+ int64_t headSize = {{kernel.size(query, 3)}};
242
+ int64_t batchSize_k = {{kernel.size(key, 0)}};
243
+ int64_t num_head_k = {{kernel.size(key, 2)}};
244
+ int64_t headSize_v = {{kernel.size(value, 3)}};
245
+ bool is_broadcast_bs_kv = batchSize != batchSize_k;
246
+ bool is_broadcast_head_kv = num_head != num_head_k;
247
+ int64_t gqa_shards = num_head / num_head_k;
248
+ int64_t bs_shards = batchSize / batchSize_k;
249
+
250
+ int64_t batchSize_kvi = {{kernel.size(kv_indices, 0)}};
251
+ int64_t num_head_kvi = {{kernel.size(kv_indices, 1)}};
252
+ int64_t block_num_kvi = {{kernel.size(kv_indices, 3)}};
253
+ bool is_broadcast_bs_kvi = batchSize != batchSize_kvi;
254
+ bool is_broadcast_head_kvi = num_head != num_head_kvi;
255
+ int64_t gqa_shards_kvi = num_head / num_head_kvi;
256
+ int64_t bs_shards_kvi = batchSize / batchSize_kvi;
257
+
258
+ int64_t kviStrideB = {{kernel.stride(kv_indices, 0)}};
259
+ int64_t kviStrideH = {{kernel.stride(kv_indices, 1)}};
260
+ int64_t kviStrideQ = {{kernel.stride(kv_indices, 2)}};
261
+
262
+ int64_t num_kviStrideB = {{kernel.stride(kv_num_blocks, 0)}};
263
+ int64_t num_kviStrideH = {{kernel.stride(kv_num_blocks, 1)}};
264
+
265
+ {%- if has_full_kv_block %}
266
+ int64_t full_kviStrideB = {{kernel.stride(full_kv_indices, 0)}};
267
+ int64_t full_kviStrideH = {{kernel.stride(full_kv_indices, 1)}};
268
+ int64_t full_kviStrideQ = {{kernel.stride(full_kv_indices, 2)}};
269
+
270
+ int64_t full_num_kviStrideB = {{kernel.stride(full_kv_num_blocks, 0)}};
271
+ int64_t full_num_kviStrideH = {{kernel.stride(full_kv_num_blocks, 1)}};
272
+ auto full_kv_indices_data = full_kv_indices;
273
+ auto full_kv_num_blocks_data = full_kv_num_blocks;
274
+ {%- endif %}
275
+
276
+ auto kv_num_blocks_data = kv_num_blocks;
277
+ auto kv_indices_data = kv_indices;
278
+
279
+ // Strides
280
+ int64_t qStrideB = {{kernel.stride(query, 0)}};
281
+ int64_t qStrideM = {{kernel.stride(query, 1)}};
282
+ int64_t qStrideH = {{kernel.stride(query, 2)}};
283
+ int64_t kStrideB = {{kernel.stride(key, 0)}};
284
+ int64_t kStrideN = {{kernel.stride(key, 1)}};
285
+ int64_t kStrideH = {{kernel.stride(key, 2)}};
286
+ int64_t vStrideB = {{kernel.stride(value, 0)}};
287
+ int64_t vStrideN = {{kernel.stride(value, 1)}};
288
+ int64_t vStrideH = {{kernel.stride(value, 2)}};
289
+ int64_t oStrideB = {{kernel.stride(output, 0)}};
290
+ int64_t oStrideM = {{kernel.stride(output, 2)}};
291
+ int64_t oStrideH = {{kernel.stride(output, 1)}};
292
+
293
+ int64_t kvSize = {{kernel.size(key, 1)}};
294
+
295
+ int64_t qSplitSize = qBlockSize;
296
+ int64_t kvSplitSize = kvBlockSize;
297
+
298
+
299
+ qSplitSize = qSplitSize > qSize ? qSize : qSplitSize;
300
+ kvSplitSize = kvSplitSize > kvSize ? kvSize : kvSplitSize;
301
+ int64_t qSlice = (qSize + qSplitSize - 1) / qSplitSize;
302
+ int64_t kvSlice = (kvSize + kvSplitSize - 1) / kvSplitSize;
303
+ int64_t kvTail = (kvSize - 1) % kvSplitSize + 1;
304
+
305
+ bool need_pack = false;
306
+ // Whether pack is needed for BFloat16/Half
307
+ if (is_reduced_type) {
308
+ // check platform ability
309
+ need_pack = std::is_same_v<scalar_t, at::BFloat16> ? at::native::cpublas::could_pack(at::kBFloat16)
310
+ : at::native::cpublas::could_pack(at::kHalf);
311
+ }
312
+ if (need_pack) {
313
+ // When the number of gemm is greater than the number of pack,
314
+ // the pack overhead can be overlapped.
315
+ int64_t thresh_size = 64;
316
+ need_pack = kvSize >= thresh_size && qSize >= thresh_size;
317
+ if (need_pack) {
318
+ double pack_size = batchSize * num_head * kvSize * headSize;
319
+ double qs_per_thread = (batchSize * num_head * qSlice + num_thread - 1) / num_thread;
320
+ double gemm_size_per_thread = qs_per_thread * qSplitSize * kvSize * headSize;
321
+ need_pack = gemm_size_per_thread / pack_size >= 4;
322
+ }
323
+ }
324
+ // Pad is needed for packing when K is not even
325
+ bool headSize_even = headSize % 2 == 0;
326
+ int64_t eheadSize = need_pack && !headSize_even ? headSize + 1: headSize;
327
+ int64_t ekvSplitSize = need_pack && (kvSplitSize % 2 != 0) ? kvSplitSize + 1 : kvSplitSize;
328
+ int64_t ekvTail = need_pack && (kvTail % 2 != 0) ? kvTail + 1 : kvTail;
329
+ int64_t kv_padding_size = (kvSize - 1) / kvSplitSize * ekvSplitSize + ekvTail;
330
+
331
+ // Allocate per thread temp buf (accumulate type)
332
+ int64_t _size_per_thread =
333
+ /* qk */ qSplitSize * kvSplitSize +
334
+ /* qk_max */ qSplitSize +
335
+ /* qk_sum */ qSplitSize +
336
+ /* dst */ qSplitSize * headSize_v;
337
+
338
+ // Inputs/outputs buffers
339
+ const scalar_t* q_data = query;
340
+ const scalar_t* k_data = key;
341
+ const scalar_t* v_data = value;
342
+ scalar_t* out_data = output;
343
+
344
+ // Buffers to store accum results, padding query and transpose/packing key/value
345
+ {{template.codegen_allocate_buffer("buf_data", "accum_t", "num_thread*_size_per_thread")}}
346
+ {{template.codegen_allocate_buffer("buf_reduced_data", "scalar_t", "num_thread*qSplitSize*ekvSplitSize")}}
347
+ {{template.codegen_allocate_buffer("key_reorder_ptr", "scalar_t", "batchSize_k*num_head_k*eheadSize*kvSize")}}
348
+ {{template.codegen_allocate_buffer("value_reorder_ptr", "scalar_t", "batchSize_k*num_head_k*kv_padding_size*headSize_v")}}
349
+ {{template.codegen_allocate_buffer("transpose_buffer_ptr", "scalar_t", "num_thread*kvSplitSize*headSize")}}
350
+ {{template.codegen_allocate_buffer("query_padding_ptr", "scalar_t", "num_thread*qSplitSize*eheadSize")}}
351
+ if (need_pack) {
352
+ // Pack K, V
353
+ at::parallel_for(0, batchSize_k * num_head_k * kvSlice, 1, [&](int64_t begin, int64_t end) {
354
+ int ompIdx = at::get_thread_num();
355
+ int64_t i = 0, j = 0, l = 0, n = 0;
356
+ scalar_t* transpose_ptr = need_pack? transpose_buffer_ptr + ompIdx * kvSplitSize * headSize : nullptr;
357
+ at::native::data_index_init(begin, i, batchSize_k, j, num_head_k, l, kvSlice);
358
+ for ([[maybe_unused]] auto z : c10::irange(begin, end)) {
359
+ n = l * kvSplitSize;
360
+ int64_t cur_kvSplitSize = std::min(kvSplitSize, kvSize - n);
361
+ auto k_addr =
362
+ k_data + i * kStrideB + j * kStrideH + n * kStrideN;
363
+ auto v_addr =
364
+ v_data + i * vStrideB + j * vStrideH + n * vStrideN;
365
+ // transpose [cur_kvSplitSize, headSize] -> [headSize, cur_kvSplitSize]
366
+ at::native::utils::transpose<uint16_t>(
367
+ cur_kvSplitSize,
368
+ headSize,
369
+ /* src_ptr */
370
+ reinterpret_cast<const uint16_t*>(k_addr),
371
+ /* ld_src */ kStrideN,
372
+ /* dst */ reinterpret_cast<uint16_t*>(transpose_ptr),
373
+ /* ld_dst */ cur_kvSplitSize);
374
+
375
+ // Pack [headSize, cur_kvSplitSize]
376
+ at::vec::pack_vnni2(
377
+ /* src */ reinterpret_cast<const uint16_t*>(transpose_ptr),
378
+ /* dst */ reinterpret_cast<uint16_t*>(key_reorder_ptr + i * num_head_k * eheadSize * kvSize +
379
+ j * eheadSize * kvSize + n * eheadSize),
380
+ /* ld_src */ cur_kvSplitSize,
381
+ /* K */ headSize,
382
+ /* N */ cur_kvSplitSize);
383
+
384
+ // Pack [cur_kvSplitSize, headSize_v]
385
+ at::vec::pack_vnni2(
386
+ /* src */ reinterpret_cast<const uint16_t*>(v_addr),
387
+ /* dst */ reinterpret_cast<uint16_t*>(value_reorder_ptr +
388
+ i * num_head_k * kv_padding_size * headSize_v +
389
+ j * kv_padding_size * headSize_v + n * headSize_v),
390
+ /* ld_src */ vStrideN,
391
+ /* K */ cur_kvSplitSize,
392
+ /* N */ headSize_v);
393
+ // Move to the next query
394
+ at::native::data_index_step(i, batchSize_k, j, num_head_k, l, kvSlice);
395
+ }
396
+ });
397
+ }
398
+ // Attention loop below
399
+ at::parallel_for(0, batchSize * num_head * qSlice, 1, [&](int64_t begin, int64_t end) {
400
+ int64_t i = 0, j = 0, k = 0;
401
+ at::native::data_index_init(begin, i, batchSize, j, num_head, k, qSlice);
402
+ int ompIdx = at::get_thread_num();
403
+ accum_t* buf_ptr = buf_data + ompIdx * _size_per_thread;
404
+ accum_t* qk_data = buf_ptr;
405
+ accum_t* qk_max_data = qk_data + qSplitSize * kvSplitSize;
406
+ accum_t* qk_sum_data = qk_max_data + qSplitSize;
407
+ accum_t* dst_data = qk_sum_data + qSplitSize;
408
+ scalar_t *qk_reduced_data =
409
+ is_reduced_type
410
+ ? buf_reduced_data + ompIdx * qSplitSize * ekvSplitSize
411
+ : nullptr;
412
+ scalar_t* query_t_padding_ptr = (!headSize_even && need_pack)
413
+ ? query_padding_ptr + ompIdx * qSplitSize * eheadSize
414
+ : nullptr;
415
+
416
+ for ([[maybe_unused]] auto z : c10::irange(begin, end)) {
417
+ auto i_kvi = is_broadcast_bs_kvi ? i/bs_shards_kvi : i;
418
+ auto j_kvi = is_broadcast_head_kvi ? j/gqa_shards_kvi : j;
419
+ auto kv_logical_num_data = kv_num_blocks_data + i_kvi * num_kviStrideB +
420
+ j_kvi * num_kviStrideH + k;
421
+ int kv_indice_num = *kv_logical_num_data;
422
+ std::vector<int> kv_indice_list(kv_indice_num);
423
+ for(int kv_i = 0; kv_i < kv_indice_num; kv_i++){
424
+ auto kv_logical_data = kv_indices_data + i_kvi * kviStrideB +
425
+ j_kvi * kviStrideH + k*kviStrideQ + kv_i;
426
+ kv_indice_list[kv_i] = *kv_logical_data;
427
+ }
428
+ bool is_skip_kv = kv_indice_num > 0 ? false : true;
429
+ {%- if has_full_kv_block %}
430
+ auto full_kv_logical_num_data = full_kv_num_blocks_data + i_kvi * num_kviStrideB +
431
+ j_kvi * num_kviStrideH + k;
432
+ int full_kv_indice_num = *full_kv_logical_num_data;
433
+ std::vector<int> full_kv_indice_list(full_kv_indice_num);
434
+ for(int kv_i = 0; kv_i < full_kv_indice_num; kv_i++){
435
+ auto full_kv_logical_data = full_kv_indices_data + i_kvi * full_kviStrideB +
436
+ j_kvi * full_kviStrideH + k*full_kviStrideQ + kv_i;
437
+ full_kv_indice_list[kv_i] = *full_kv_logical_data;
438
+ }
439
+ is_skip_kv = kv_indice_num + full_kv_indice_num > 0 ? false : true;
440
+ {%- endif %}
441
+ int64_t m = k * qSplitSize;
442
+ int64_t cur_qSplitSize = std::min(qSplitSize, qSize - m);
443
+ if (!is_skip_kv){
444
+ // Initialize max and sum
445
+ {{kernel.kernel_name}}_fill_stub(qk_max_data,
446
+ -std::numeric_limits<accum_t>::infinity(), cur_qSplitSize);
447
+ {{kernel.kernel_name}}_fill_stub(qk_sum_data,
448
+ static_cast<accum_t>(0), cur_qSplitSize);
449
+
450
+ if (!headSize_even && need_pack) {
451
+ // Pad query if headSize is not even
452
+ {{kernel.kernel_name}}_copy_value_with_pad<scalar_t>(
453
+ q_data + i * qStrideB + j * qStrideH + m * qStrideM,
454
+ query_t_padding_ptr,
455
+ cur_qSplitSize,
456
+ headSize,
457
+ cur_qSplitSize,
458
+ eheadSize,
459
+ qStrideM
460
+ );
461
+ }
462
+ }
463
+
464
+ {%- if has_full_kv_block %}
465
+ for (int64_t n_idx = 0; n_idx < kv_indice_num + full_kv_indice_num ; n_idx += 1) {
466
+ auto n = n_idx < kv_indice_num ? kv_indice_list[n_idx]*kvSplitSize : full_kv_indice_list[n_idx - kv_indice_num]*kvSplitSize;
467
+ {%- else %}
468
+ for (int64_t n_idx = 0; n_idx < kv_indice_num ; n_idx += 1) {
469
+ auto n = kv_indice_list[n_idx]*kvSplitSize;
470
+ {%- endif %}
471
+
472
+ auto cur_n = n/kvSplitSize;
473
+ int64_t cur_kvSplitSize = std::min(kvSplitSize, kvSize - n);
474
+ int64_t cur_ekvSplitSize = (need_pack && cur_kvSplitSize % 2 != 0) ? cur_kvSplitSize + 1 : cur_kvSplitSize;
475
+
476
+ // Calculate scale * q @ k.T
477
+ auto i_kv = is_broadcast_bs_kv ? i/bs_shards : i;
478
+ auto j_kv = is_broadcast_head_kv ? j/gqa_shards : j;
479
+
480
+ if (!need_pack) {
481
+ auto k_addr =
482
+ k_data + i_kv * kStrideB + j_kv * kStrideH + n * kStrideN;
483
+
484
+ {{kernel.kernel_name}}_kernel_micro_gemm_transpose_b<static_cast<bool>(false)>(
485
+ q_data + i * qStrideB + j * qStrideH +
486
+ m * qStrideM,
487
+ k_addr,
488
+ qk_data,
489
+ cur_qSplitSize,
490
+ cur_kvSplitSize,
491
+ headSize,
492
+ qStrideM,
493
+ kStrideN,
494
+ cur_kvSplitSize);
495
+
496
+ } else {
497
+ at::native::cpublas::brgemm(
498
+ cur_qSplitSize,
499
+ cur_kvSplitSize,
500
+ eheadSize,
501
+ headSize_even ? qStrideM : eheadSize,
502
+ cur_kvSplitSize,
503
+ cur_kvSplitSize,
504
+ false,
505
+ !headSize_even
506
+ ? query_t_padding_ptr
507
+ : q_data + i * qStrideB + j * qStrideH + m * qStrideM,
508
+ key_reorder_ptr + i_kv * num_head_k * eheadSize * kvSize +
509
+ j_kv * eheadSize * kvSize + n * eheadSize,
510
+ qk_data,
511
+ need_pack);
512
+ }
513
+
514
+ {{kernel.kernel_name}}_mul_scale_kernel<accum_t>(qk_data, scaling_factor, cur_qSplitSize*cur_kvSplitSize);
515
+
516
+ {%- if score_mod and mask_mod %}
517
+ // TODO: reduce the number of calls of q_idx and kv_idx initialization
518
+ std::vector<int64_t> q_idx(cur_qSplitSize);
519
+ for (int64_t i = 0; i < cur_qSplitSize; ++i) {
520
+ q_idx[i] = m + i;
521
+ }
522
+
523
+ std::vector<int64_t> kv_idx(cur_kvSplitSize);
524
+ for (int64_t i = 0; i < cur_kvSplitSize; ++i) {
525
+ kv_idx[i] = n + i;
526
+ }
527
+
528
+ std::vector<int64_t> b_idx = {i};
529
+ std::vector<int64_t> h_idx = {j};
530
+
531
+ accum_t* in_ptr0 = qk_data;
532
+
533
+ auto in_ptr1 = b_idx.data();
534
+ auto in_ptr2 = h_idx.data();
535
+ auto in_ptr3 = q_idx.data();
536
+ auto in_ptr4 = kv_idx.data();
537
+
538
+ // apply score mod function
539
+ {
540
+ {{ template.generate_other_buffer("score_others", 0, "len_score_other", kernel.args) }}
541
+ accum_t* out_ptr{{score_buf_idx}} = in_ptr0;
542
+ {{ template.modification(score_mod, score_buf_name, score_buf_idx)|indent(12, false) }}
543
+ }
544
+
545
+ if ((std::find(kv_indice_list.begin(), kv_indice_list.end(), cur_n) != kv_indice_list.end()) ){
546
+ // Apply block mask, fill unused with -inf
547
+ {
548
+ {{ template.generate_other_buffer("mask_others", -1, "len_mask_other", kernel.args) }}
549
+ accum_t* out_ptr{{mask_buf_idx}} = in_ptr0;
550
+ {{ template.modification(mask_mod, mask_buf_name, mask_buf_idx)|indent(12, false) }}
551
+ }
552
+ }
553
+
554
+ {%- endif %}
555
+ // Update coefficients with Softmax
556
+ accum_t tmp_max = 0, tmp_sum = 0, exp_tmp = 0;
557
+ for (int64_t row = 0; row < cur_qSplitSize; ++row) {
558
+ // apply scaling factor and max per row in fusion
559
+ {{kernel.kernel_name}}_mul_reduce_max_fusion_kernel(
560
+ qk_data + row * cur_kvSplitSize,
561
+ static_cast<accum_t>(1),
562
+ cur_kvSplitSize,
563
+ qk_data + row * cur_kvSplitSize,
564
+ tmp_max);
565
+ tmp_max = qk_max_data[row] > tmp_max ? qk_max_data[row] : tmp_max;
566
+ if (tmp_max == -std::numeric_limits<accum_t>::infinity()) {
567
+ // to avoid `nan = exp2f(-inf - (-inf))`
568
+ {{kernel.kernel_name}}_fill_stub(
569
+ {{kernel.kernel_name}}_conditional_data_ptr(qk_data, qk_reduced_data) + row * cur_ekvSplitSize,
570
+ static_cast<scalar_t>(0), cur_kvSplitSize);
571
+ } else {
572
+ tmp_sum = tmp_max;
573
+ // qk <- exp(qk - max) and sum per row
574
+ {{kernel.kernel_name}}_exp_reduce_sum_fusion_kernel(
575
+ qk_data + row * cur_kvSplitSize, cur_kvSplitSize,
576
+ {{kernel.kernel_name}}_conditional_data_ptr(qk_data, qk_reduced_data) + row * cur_ekvSplitSize,
577
+ tmp_sum);
578
+ // exp_tmp <- exp(max[row] - max)
579
+ exp_tmp = std::exp(qk_max_data[row] - tmp_max);
580
+ // sum[row] <- sum + exp_tmp * sum[row]
581
+ qk_sum_data[row] = tmp_sum + exp_tmp * qk_sum_data[row];
582
+ // max[row] <- max
583
+ qk_max_data[row] = tmp_max;
584
+ // dst <- dst * exp_tmp
585
+ if (n_idx > 0) {
586
+ at::vec::map<accum_t>(
587
+ [exp_tmp](Vec x) { return x * Vec(exp_tmp); },
588
+ dst_data + row * headSize_v,
589
+ dst_data + row * headSize_v,
590
+ headSize_v);
591
+ }
592
+ }
593
+ if (need_pack && cur_kvSplitSize % 2 != 0) {
594
+ // Pad: [qSplitSize, cur_kvSplitSize] -> [qSplitSize, cur_kvSplitSize + 1]
595
+ *(qk_reduced_data + row * (1 + cur_kvSplitSize) + cur_kvSplitSize) = scalar_t(0);
596
+ }
597
+ }
598
+ // Calculate Softmax(q @ k.T) @ v
599
+ if (!need_pack) {
600
+ auto v_addr =
601
+ v_data + i_kv * vStrideB + j_kv * vStrideH + n * vStrideN;
602
+ // Fallback Half brgemm is slower than micro gemm
603
+ if (!std::is_same_v<scalar_t, at::Half>) {
604
+ at::native::cpublas::brgemm(
605
+ cur_qSplitSize,
606
+ headSize_v,
607
+ cur_ekvSplitSize,
608
+ cur_ekvSplitSize,
609
+ vStrideN,
610
+ headSize_v,
611
+ n_idx > 0,
612
+ {{kernel.kernel_name}}_conditional_data_ptr(qk_data, qk_reduced_data),
613
+ v_addr,
614
+ dst_data,
615
+ need_pack);
616
+ } else {
617
+ if (n_idx > 0) {
618
+ {{kernel.kernel_name}}_kernel_micro_gemm<static_cast<bool>(true)>(
619
+ {{kernel.kernel_name}}_conditional_data_ptr(qk_data, qk_reduced_data),
620
+ v_addr,
621
+ dst_data,
622
+ cur_qSplitSize,
623
+ headSize_v,
624
+ cur_ekvSplitSize,
625
+ cur_ekvSplitSize,
626
+ vStrideN,
627
+ headSize_v);
628
+ } else {
629
+ {{kernel.kernel_name}}_kernel_micro_gemm<static_cast<bool>(false)>(
630
+ {{kernel.kernel_name}}_conditional_data_ptr(qk_data, qk_reduced_data),
631
+ v_addr,
632
+ dst_data,
633
+ cur_qSplitSize,
634
+ headSize_v,
635
+ cur_ekvSplitSize,
636
+ cur_ekvSplitSize,
637
+ vStrideN,
638
+ headSize_v);
639
+ }
640
+ }
641
+ } else {
642
+ int64_t psize = n / kvSplitSize * ekvSplitSize;
643
+ at::native::cpublas::brgemm(
644
+ cur_qSplitSize,
645
+ headSize_v,
646
+ cur_ekvSplitSize,
647
+ cur_ekvSplitSize,
648
+ headSize_v,
649
+ headSize_v,
650
+ n_idx > 0,
651
+ qk_reduced_data,
652
+ value_reorder_ptr +
653
+ i_kv * num_head_k * kv_padding_size * headSize_v +
654
+ j_kv * kv_padding_size * headSize_v + psize * headSize_v,
655
+ dst_data,
656
+ need_pack);
657
+ }
658
+ }
659
+
660
+ // dst <- dst / sum[row]
661
+ // reorder MHA output with strides
662
+ for (int64_t row = 0; row < cur_qSplitSize; ++row) {
663
+ // Row sums for full masked out rows are 0, we set them to 1
664
+ // in order to avoid NaNs in the output and instead set fully
665
+ // masked out rows to 0
666
+ qk_max_data[row] = qk_max_data[row] == -std::numeric_limits<accum_t>::infinity() ? 0 : qk_max_data[row];
667
+ qk_sum_data[row] = qk_sum_data[row] == 0 ? 1 : qk_sum_data[row];
668
+ accum_t sum_reciprocal = 1 / qk_sum_data[row];
669
+ at::vec::map<scalar_t>(
670
+ [sum_reciprocal, is_skip_kv](Vec x) { return is_skip_kv ? Vec(0.0) : x * Vec(sum_reciprocal); },
671
+ out_data + i * oStrideB + j * oStrideH + m * oStrideM + row * oStrideM,
672
+ dst_data + row * headSize_v,
673
+ headSize_v);
674
+ }
675
+
676
+ // Move to the next query
677
+ at::native::data_index_step(i, batchSize, j, num_head, k, qSlice);
678
+ }
679
+
680
+ at::native::cpublas::brgemm_release(need_pack);
681
+
682
+ });
683
+ }
684
+ """
685
+
686
+
687
class CppFlexAttentionTemplate(CppTemplate):
    """C++ codegen template for FlexAttention on the CPU (inductor) backend.

    Renders the ``FLEX_ATTENTION_TEMPLATE`` Jinja source (defined earlier in
    this file) into a complete C++ kernel, including the softmax fusion
    helpers, brgemm pack functions and micro-GEMM definitions.  The optional
    ``score_mod`` / ``mask_mod`` subgraphs are lowered to C++ loop bodies via
    :meth:`modification` and spliced into the attention inner loop.
    """

    def __init__(
        self,
        input_nodes,
        layout: ir.Layout,
        scale,
        score_mod,
        mask_mod,
        kv_block_size,
        q_block_size,
        has_other_buffer,
        no_full_kv_block,
        fake_buffers,
        len_score_other,
        len_mask_other,
        kernel_input_name_to_buffer,
        block_vars,
    ) -> None:
        # Only fp32/bf16/fp16 outputs are supported by this template.
        assert layout.dtype in [torch.float, torch.bfloat16, torch.float16]
        super().__init__("flex_attention", input_nodes, layout, parallel_num_threads())
        self.scale = scale
        self.score_mod = score_mod
        self.mask_mod = mask_mod
        # Register the score/mask subgraphs as graph buffers so they get
        # stable "bufN"-style names we can reference from the template.
        self.score_buf_name = (
            V.graph.register_buffer(self.score_mod) if self.score_mod else None
        )
        self.mask_buf_name = (
            V.graph.register_buffer(self.mask_mod) if self.mask_mod else None
        )

        def get_idx(buf_name):
            # Extract the numeric suffix of a registered buffer name
            # (e.g. "buf3" -> "3") for use as out_ptr{idx} in the template.
            match = re.search(r"\d+", buf_name)
            assert match, f"incorrect score buf name: {buf_name}"
            return match.group()

        self.score_buf_idx = (
            get_idx(self.score_buf_name) if self.score_buf_name else None
        )
        self.mask_buf_idx = get_idx(self.mask_buf_name) if self.mask_buf_name else None
        self.kv_block_size = kv_block_size
        self.q_block_size = q_block_size
        self.has_other_buffer = has_other_buffer
        self.no_full_kv_block = no_full_kv_block
        # When full KV blocks are present, input_nodes additionally carries
        # full_kv_num_blocks/full_kv_indices, shifting the "other buffer"
        # inputs by two positions.
        self.other_buffer_input_offset = 2
        if self.no_full_kv_block:
            self.other_buffer_input_offset = 0
        self.fake_buffers = fake_buffers
        self.len_score_other = len_score_other
        self.len_mask_other = len_mask_other
        self.kernel_input_name_to_buffer = kernel_input_name_to_buffer
        self.block_vars = block_vars
        # Symbolic (dynamic-shape) values among the kernel inputs become
        # extra size variables of the generated kernel.
        self.extra_sizevars = list(
            OrderedSet(
                val
                for val in self.kernel_input_name_to_buffer.values()
                if isinstance(val, sympy.Symbol)
            )
        )
        # Inputs [0..4] are query/key/value/kv_num_blocks/kv_indices; the
        # score/mask "other" buffers follow after the optional full-KV pair.
        self.other_buf_start_idx = 5
        self.score_mod_other_buffers = (
            self.input_nodes[
                self.other_buf_start_idx
                + self.other_buffer_input_offset : self.other_buf_start_idx
                + self.other_buffer_input_offset
                + self.len_score_other
            ]
            if self.has_other_buffer
            else None
        )
        self.mask_mod_other_buffers = (
            self.input_nodes[
                self.other_buf_start_idx
                + self.other_buffer_input_offset
                + self.len_score_other :
            ]
            if self.has_other_buffer
            else None
        )
        # Maps generated pointer names ("in_ptrN") to (kernel arg name, arg);
        # populated lazily by generate_other_buffer().
        self.other_ptr_data = {}  # type: ignore[var-annotated]

    def update_kernel_args(self, kernel_args):
        """Add all non-symbolic kernel inputs to *kernel_args* and return it."""
        kernel_args.update(
            {
                key: value
                for key, value in self.kernel_input_name_to_buffer.items()
                if not isinstance(value, sympy.Symbol)
            }
        )
        return kernel_args

    def generate_other_buffer(self, buf_list, start_offset, len_attr, kernel_args):
        """Emit ``auto in_ptrN = <arg>;`` declarations for extra mod buffers.

        ``buf_list`` is the key prefix ("score_others"/"mask_others"),
        ``start_offset`` the pointer-index offset (-1 means "after the score
        buffers"), and ``len_attr`` the name of the length attribute on
        ``self``.  Also records the mapping in ``self.other_ptr_data`` so
        :meth:`modification` can wire the same pointers into the subgraph.
        Returns the joined C++ declarations (empty string when there are no
        other buffers).
        """
        kernel_input_name_to_buffer_name = {
            key: value if isinstance(value, sympy.Symbol) else value.get_name()
            for key, value in self.kernel_input_name_to_buffer.items()
        }

        def get_arg(name):
            return kernel_input_name_to_buffer_name.get(name)

        def get_arg_name(name):
            # Symbols live in sizevars; real buffers in input_buffers.
            if isinstance(get_arg(name), sympy.Symbol):
                return kernel_args.sizevars.get(get_arg(name))
            return kernel_args.input_buffers.get(get_arg(name))

        if not self.has_other_buffer:
            return ""

        if start_offset == -1:
            start_offset = getattr(self, len_attr)

        length = getattr(self, len_attr)
        for i in range(length):
            pointer = f"in_ptr{self.other_buf_start_idx + start_offset + i}"
            buffer_key = f"{buf_list}_{i}"
            if pointer not in self.other_ptr_data:
                self.other_ptr_data[pointer] = (
                    get_arg_name(buffer_key),
                    get_arg(buffer_key),
                )

        return "\n".join(
            f"auto {ptr} = {name};" for ptr, (name, _) in self.other_ptr_data.items()
        )

    def modification(self, subgraph_buffer, output_name, output_idx):
        """Lower a score/mask subgraph to C++ loop code.

        Builds a ``LoopBody`` that stores the subgraph's result into
        ``out_ptr{output_idx}`` and codegens it with ``CppKernelProxy``,
        wiring the standard inputs (score, b, h, q_idx, kv_idx) plus any
        previously recorded "other" buffer pointers.  Returns the generated
        C++ source with symbolic split-size names rewritten to
        ``cur_qSplitSize`` / ``cur_kvSplitSize``.
        """
        assert isinstance(subgraph_buffer, ir.ComputedBuffer)
        subgraph_buffer_data = subgraph_buffer.data
        from ..loop_body import LoopBody
        from ..utils import sympy_index_symbol_with_prefix, SymT
        from ..virtualized import V
        from .cpp import CppKernelProxy, KernelGroup

        kernel_group = KernelGroup()
        # Fixed argument layout expected by FLEX_ATTENTION_TEMPLATE.
        kernel_input_args = {
            "score": "in_ptr0",
            "b": "in_ptr1",
            "h": "in_ptr2",
            "q_idx": "in_ptr3",
            "kv_idx": "in_ptr4",
        }
        if self.has_other_buffer:
            kernel_input_args.update(
                {arg: ptr for ptr, (_, arg) in self.other_ptr_data.items()}
            )

        kernel_output_args = {output_name: f"out_ptr{output_idx}"}

        args = kernel_group.args
        for name, inp in kernel_input_args.items():
            args.input_buffers[name] = inp

        for name, inp in kernel_output_args.items():
            args.output_buffers[name] = inp

        for name in self.extra_sizevars:
            args.sizevars[name] = f"k{name}"

        kernel_group.args = args

        cpp_kernel_proxy = CppKernelProxy(kernel_group)
        bodies = []
        var_sizes_list = []
        var_sizes = tuple(subgraph_buffer.get_size())
        var_ranges = {
            sympy_index_symbol_with_prefix(SymT.INDEX, i): sz
            for i, sz in enumerate(var_sizes)
        }

        dst_layout = subgraph_buffer.get_layout()
        output_index = dst_layout.make_indexer()([*var_ranges.keys()])

        def fn(*args):
            # Store the subgraph result at the indexed output location.
            V.ops.store(
                output_name,
                output_index,
                subgraph_buffer_data.make_loader()(args).value,
            )

        body = LoopBody(
            fn,
            (list(var_ranges.keys())),
            var_ranges,
            list(var_ranges.keys()),
            tuple(),
        )

        from ..loop_body import MemoryUsageType

        assert all(
            mem.buffer_name in kernel_group.args.input_buffers
            for mem in body.memory_usage[MemoryUsageType.LOAD]
        ), (
            "All the buffers in the score and mask subgraph should be in kernel_group.args.input_buffers"
        )

        bodies.append(body)
        var_sizes_list.append((var_sizes, ()))

        cpp_kernel_proxy.codegen_loop_bodies(bodies, var_sizes_list)
        kernel_group.finalize_kernel(cpp_kernel_proxy, [])
        output_code = kernel_group.loops_code.getvalue()

        var_q_symbol, var_kv_symbol = self.block_vars
        # See [Note] Handle the case where the split sizes are not statically known.
        # We don't know the value of qBlockSize and rkvBlockSize during compilation time
        # thus we've represented them by symbols.
        # We change the symbol strings back to "cur_qSplitSize" and "cur_kvSplitSize"
        # in the generated code thus they'll be filled with the real value during runtime.
        if var_q_symbol in kernel_group.args.sizevars:
            output_code = output_code.replace(
                kernel_group.args.sizevars[var_q_symbol], "cur_qSplitSize"
            )
        if var_kv_symbol in kernel_group.args.sizevars:
            output_code = output_code.replace(
                kernel_group.args.sizevars[var_kv_symbol], "cur_kvSplitSize"
            )

        return output_code

    @staticmethod
    def add_choices(
        choices,
        input_nodes,
        layout,
        scale,
        score_mod,
        mask_mod,
        kv_block_size,
        q_block_size,
        has_other_buffer,
        no_full_kv_block,
        fake_buffers,
        len_score_other,
        len_mask_other,
        kernel_input_name_to_buffer,
        block_vars,
    ):
        """Append this template (wrapped for autotuning) to *choices*.

        The pre/post processors are identity functions; the wrapper simply
        constructs a ``CppFlexAttentionTemplate`` with the given arguments.
        Returns the created wrapper.
        """

        def preprocessor(input_nodes, layout):
            return input_nodes, layout

        def postprocessor(output):
            return output

        template = DataProcessorTemplateWrapper(
            CppFlexAttentionTemplate,
            preprocessor,
            postprocessor,
            input_nodes=input_nodes,
            layout=layout,
            scale=scale,
            score_mod=score_mod,
            mask_mod=mask_mod,
            kv_block_size=kv_block_size,
            q_block_size=q_block_size,
            has_other_buffer=has_other_buffer,
            no_full_kv_block=no_full_kv_block,
            fake_buffers=fake_buffers,
            len_score_other=len_score_other,
            len_mask_other=len_mask_other,
            kernel_input_name_to_buffer=kernel_input_name_to_buffer,
            block_vars=block_vars,
        )
        template.maybe_append_choice(choices)
        return template

    def apply_score_mod(self, score, b, h, q_idx, kv_idx):
        """Evaluate the score_mod graph module eagerly and return a scalar."""
        return self.score_mod.graph_module(score, b, h, q_idx, kv_idx).item()

    def render(  # type: ignore[override,return]
        self,
        kernel,
        template_buffer_node: Optional[ir.CppTemplateBuffer] = None,
        epilogue_nodes: Optional[list[ir.IRNode]] = None,
        **kwargs,
    ) -> str:
        """Render the full C++ flex-attention kernel source.

        Permutes Q/K/V to (B, seq, H, head_dim) views, assembles the Jinja
        options dict, and renders ``FLEX_ATTENTION_TEMPLATE`` while patching
        ``V.graph.get_dtype`` for the fake buffers.  Epilogue fusion is not
        supported and raises ``NotImplementedError``.
        """
        if epilogue_nodes is not None and epilogue_nodes != []:
            raise NotImplementedError(
                "Unsupported for `epilogue_nodes` in CppFlexAttentionTemplate."
            )
        # Query (Batch x Num_heads x Q_seq_len x Dim_per_head)
        # -> (Batch x Q_seq_len x Num_heads x Dim_per_head)
        # Key (Batch x Num_heads x KV_seq_len x Dim_per_head)
        # -> (Batch x KV_seq_len x Num_heads x Dim_per_head)
        # Value (Batch x Num_heads x KV_seq_len x Dim_per_head)
        # -> (Batch x KV_seq_len x Num_heads x Dim_per_head)

        query = kernel.permute(self.input_nodes[0], [0, 2, 1, 3])
        key = kernel.permute(self.input_nodes[1], [0, 2, 1, 3])
        value = kernel.permute(self.input_nodes[2], [0, 2, 1, 3])
        # Accumulation always happens in fp32 regardless of input dtype.
        self.accumulate_dtype = torch.float
        self.input_dtype = query.layout.dtype

        num_threads = parallel_num_threads()
        buf_out = TensorBox.create(self.output_node)
        if template_buffer_node is not None:
            buf_out = template_buffer_node
        options = dict(
            query=query,
            key=key,
            value=value,
            kv_num_blocks=self.input_nodes[3],
            kv_indices=self.input_nodes[4],
            full_kv_num_blocks=self.input_nodes[5]
            if not self.no_full_kv_block
            else None,
            full_kv_indices=self.input_nodes[6] if not self.no_full_kv_block else None,
            score_mod_other_buffers=self.score_mod_other_buffers,
            mask_mod_other_buffers=self.mask_mod_other_buffers,
            scale=self.scale,
            has_full_kv_block=not self.no_full_kv_block,
            accumulate_dtype=self.accumulate_dtype,
            query_dtype=self.input_dtype,
            kvBlockSize=self.kv_block_size,
            qBlockSize=self.q_block_size,
            template=self,
            output=buf_out,
            kernel=kernel,
            num_thread=num_threads,
            score_mod=self.score_mod,
            mask_mod=self.mask_mod,
            score_buf_name=self.score_buf_name,
            mask_buf_name=self.mask_buf_name,
            score_buf_idx=self.score_buf_idx,
            mask_buf_idx=self.mask_buf_idx,
        )
        with contextlib.ExitStack() as stack:
            # Fake buffers have no real dtype registration; patch get_dtype
            # so template rendering can query them.
            for buf in self.fake_buffers:
                stack.enter_context(
                    patch.object(V.graph, "get_dtype", self._fake_get_dtype(buf))
                )
            return self._template_from_string(FLEX_ATTENTION_TEMPLATE).render(**options)

    def codegen_softmax_fusion(self, kernel_name: str):
        """Render the fused softmax helper kernels for *kernel_name*."""
        # TODO: use inductor IR to rewrite those fusions
        return self._template_from_string(SOFTMAX_FUSIONS).render(
            dict(kernel_name=kernel_name)
        )

    def codegen_brgemm_pack_function(self, kernel_name: str):
        """Render the brgemm weight-packing helper functions."""
        # TODO: make them general for common bmm templates
        return self._template_from_string(BRGEMM_PACK_FUNCTIONS).render(
            dict(kernel_name=kernel_name)
        )

    def codegen_allocate_buffer(self, buffer_name: str, buffer_dtype, buffer_size):
        """Render a local buffer allocation of the given name/dtype/size."""
        return self._template_from_string(ALLOCATE_BUFFER).render(
            dict(
                buffer_name=buffer_name,
                buffer_dtype=buffer_dtype,
                buffer_size=buffer_size,
            )
        )

    def micro_gemm_define(self, kernel_name: str):
        """Generate C++ definitions for the two FP32-vec micro-GEMMs.

        Creates one transposed-B variant (used for q @ k.T) and one plain
        variant (used for p @ v), both with 1x16x1 register blocking, and
        returns their concatenated codegen output.
        """
        from torch._inductor.codegen.cpp_gemm_template import (
            CppTemplateKernel,
            parallel_num_threads,
        )
        from torch._inductor.codegen.cpp_micro_gemm import CppMicroGemmFP32Vec
        from torch._inductor.virtualized import V

        micro_gemm_trans = CppMicroGemmFP32Vec(
            kernel_name + "_kernel_micro_gemm_transpose_b",
            self.input_dtype,
            self.input_dtype,
            self.accumulate_dtype,
            self.accumulate_dtype,
            GemmBlocking(1, 16, 1),
            1,
            True,
            True,
        )

        micro_gemm = CppMicroGemmFP32Vec(
            kernel_name + "_kernel_micro_gemm",
            self.input_dtype,
            self.input_dtype,
            self.accumulate_dtype,
            self.accumulate_dtype,
            GemmBlocking(1, 16, 1),
            1,
            True,
            False,
        )

        with V.set_graph_handler(V.graph):
            kernel = CppTemplateKernel("cpp_micro_gemm", parallel_num_threads())
            code_trans = micro_gemm_trans.codegen_define(kernel)
            code = micro_gemm.codegen_define(kernel)
        return code + code_trans

    def codegen_micro_gemm(self, kernel_name: str):
        """Render the micro-GEMM template with the generated definitions."""
        micro_gemm = self.micro_gemm_define(kernel_name)
        GEMM_SOURCE_CODE = MICRO_GEMM_TEMPLATE.replace("GEMM_DEFINE", micro_gemm)
        return self._template_from_string(GEMM_SOURCE_CODE).render()
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_gemm_template.py ADDED
@@ -0,0 +1,1777 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import contextlib
3
+ import logging
4
+ import math
5
+ from functools import lru_cache
6
+ from typing import Any, Callable, cast, Optional, TypeVar, Union
7
+ from unittest.mock import patch
8
+
9
+ import torch
10
+ import torch.utils
11
+ from torch.utils._ordered_set import OrderedSet
12
+
13
+ from ..._dynamo.utils import counters
14
+ from .. import config, ir, lowering as L
15
+ from ..kernel.mm_common import mm_args
16
+ from ..select_algorithm import DataProcessorTemplateWrapper
17
+ from ..utils import (
18
+ has_free_symbols,
19
+ is_same_mkldnn_tensor,
20
+ is_same_tensor,
21
+ parallel_num_threads,
22
+ )
23
+ from ..virtualized import ops, V
24
+ from .cpp import get_export_declaration
25
+ from .cpp_micro_gemm import (
26
+ CppMicroBrgemm,
27
+ CppMicroGemm,
28
+ CppMicroGemmAMX,
29
+ CppMicroGemmFP32Vec,
30
+ create_micro_gemm,
31
+ is_int8_woq_gemm_small_m_dim_corner_case,
32
+ LayoutType,
33
+ )
34
+ from .cpp_template import CppTemplate
35
+ from .cpp_template_kernel import CppTemplateKernel
36
+ from .cpp_utils import (
37
+ create_epilogue_with_attr,
38
+ DTYPE_TO_CPP,
39
+ GemmBlocking,
40
+ get_gemm_template_output_and_compute_dtype,
41
+ )
42
+
43
+
44
+ log = logging.getLogger(__name__)
45
+
46
# Jinja/C++ snippet: declares the compile-time GEMM dimensions (M, N, K),
# the micro-kernel register blocking (Mr, Nr, Kr) and the derived register
# block counts.  M is runtime (`const`) when shapes are dynamic, otherwise
# `constexpr`.
GEMM_TEMPLATE_INIT_BLOCKING_BASIC_BLOCK = r"""
    constexpr int64_t num_threads = {{num_threads}};
    constexpr int64_t N = {{N}};
    constexpr int64_t K = {{K}};
    constexpr int64_t Mr = {{micro_gemm.register_blocking.block_m}};
    constexpr int64_t Nr = {{micro_gemm.register_blocking.block_n}};
    constexpr int64_t Kr = {{micro_gemm.register_blocking.block_k}};
    constexpr int64_t Nr_blocks = (N + Nr - 1) / Nr;
    constexpr int64_t Kr_blocks = (K + Kr - 1) / Kr;
{%- if is_dynamic_M %}
    const int64_t M = {{kernel.size(GemmOut, 0)}};
    const int64_t Mr_blocks = (M + Mr - 1) / Mr;
{%- else %}
    constexpr int64_t M = {{kernel.size(GemmOut, 0)}};
    constexpr int64_t Mr_blocks = (M + Mr - 1) / Mr;
{%- endif %}
"""

# Jinja/C++ snippet: computes the per-thread (Mt/Nt/Kt) and cache-level
# (Mc/Nc/Kc) blocking.  For dynamic M this is done at runtime via
# mm_get_thread_blocking / mm_get_cache_blocking; for static shapes the
# values are baked in from the template's Python-side blocking, and a final
# assert checks that the thread partitioning covers all register blocks.
GEMM_TEMPLATE_INIT_BLOCKING_EXTENDED = r"""
{%- if is_dynamic_M %}
{%- if num_threads > 1 %}
    int64_t Mt_blocks, Nt_blocks, Kt_blocks;
    mm_get_thread_blocking(num_threads, {{config.cpp.gemm_max_k_slices}}, M, N, K, Mr, Nr, Kr, Mt_blocks, Nt_blocks, Kt_blocks);
{%- else %}
    const auto Mt_blocks = Mr_blocks;
    const auto Nt_blocks = Nr_blocks;
    const auto Kt_blocks = Kr_blocks;
{%- endif %}
    int64_t Mc_blocks, Nc_blocks, Kc_blocks;
    uint32_t L1_cache_size = {{L1_cache_size}};
    uint32_t L2_cache_size = {{L2_cache_size}};
    mm_get_cache_blocking<{{kernel.dtype(X)}}, {{kernel.dtype(W)}}>(
        num_threads,
        M,
        N,
        K,
        Mr,
        Nr,
        Kr,
        Mt_blocks,
        Nt_blocks,
        Kt_blocks,
        Mc_blocks,
        Nc_blocks,
        Kc_blocks,
        L1_cache_size,
        L2_cache_size
    );
    const int64_t num_Mc_blocks = (Mr_blocks + Mc_blocks - 1) / Mc_blocks;
    const int64_t num_Nc_blocks = (Nr_blocks + Nc_blocks - 1) / Nc_blocks;
    const int64_t num_Mt_blocks = (Mr_blocks + Mt_blocks - 1) / Mt_blocks;
    const int64_t num_Nt_blocks = (Nr_blocks + Nt_blocks - 1) / Nt_blocks;
    const int64_t num_Kt_blocks = (Kr_blocks + Kt_blocks - 1) / Kt_blocks;
{%- else %}
    constexpr int64_t Mt_blocks = {{template.thread_blocking(num_threads).block_m}};
    constexpr int64_t Nt_blocks = {{template.thread_blocking(num_threads).block_n}};
    constexpr int64_t Kt_blocks = {{template.thread_blocking(num_threads).block_k}};
    constexpr int64_t Mc_blocks = {{template.cache_blocking(num_threads).block_m}};
    constexpr int64_t Nc_blocks = {{template.cache_blocking(num_threads).block_n}};
    constexpr int64_t Kc_blocks = {{template.cache_blocking(num_threads).block_k}};
    constexpr int64_t num_Mc_blocks = (Mr_blocks + Mc_blocks - 1) / Mc_blocks;
    constexpr int64_t num_Nc_blocks = (Nr_blocks + Nc_blocks - 1) / Nc_blocks;
    constexpr int64_t num_Mt_blocks = (Mr_blocks + Mt_blocks - 1) / Mt_blocks;
    constexpr int64_t num_Nt_blocks = (Nr_blocks + Nt_blocks - 1) / Nt_blocks;
    constexpr int64_t num_Kt_blocks = (Kr_blocks + Kt_blocks - 1) / Kt_blocks;
{%- endif %}
{%- if is_woq_int4 %}
    int64_t group_size = *q_group_size;
{%- endif %}

    // make sure all partitions are assigned
    {{kernel.assert_function}}(
        Mt_blocks * Nt_blocks * Kt_blocks * {{num_threads}} >= Mr_blocks * Nr_blocks * Kr_blocks,
        "Not all partitions are assigned."
    );
"""

# Jinja/C++ snippet: per-thread loop bounds inside an OpenMP parallel
# region.  Derives this thread's K-slice, N-slice and M-range from its
# omp thread id.
GEMM_TEMPLATE_MULTI_THREADS_PARAMS = r"""
    const int tid = omp_get_thread_num();
    const int64_t k_group_id = tid / num_Kt_blocks;
    const int64_t k_slice_id = tid % num_Kt_blocks;
    const int64_t n_group_id = k_group_id / num_Nt_blocks;
    const int64_t n_slice_id = k_group_id % num_Nt_blocks;
    const int64_t k_block_start = k_slice_id * Kt_blocks;
    const int64_t k_block_end = std::min(k_block_start + Kt_blocks, Kr_blocks);
    const int64_t n_block_start = n_slice_id * Nt_blocks;
    const int64_t n_block_end = std::min(n_block_start + Nt_blocks, Nr_blocks);
    const int64_t m_block_start = std::min(n_group_id * Mt_blocks, Mr_blocks);
    const int64_t m_block_end = std::min(m_block_start + Mt_blocks, Mr_blocks);
    const int64_t num_Mc_blocks_per_thread = (m_block_end - m_block_start + Mc_blocks - 1) / Mc_blocks;
"""

# Jinja/C++ snippet: the single-threaded equivalent of the parameters
# above — one thread owns the full M/N/K range.
GEMM_TEMPLATE_SINGLE_THREAD_PARAMS = r"""
    constexpr int tid = 0;
    constexpr int64_t k_group_id = 0;
    constexpr int64_t k_slice_id = 0;
    constexpr int64_t n_group_id = 0;
    constexpr int64_t n_slice_id = 0;
    constexpr int64_t m_block_start = 0;
    constexpr int64_t n_block_start = 0;
    constexpr int64_t n_block_end = Nr_blocks;
    constexpr int64_t k_block_start = 0;
    constexpr int64_t k_block_end = Kr_blocks;
{%- if is_dynamic_M %}
    const int64_t num_Mc_blocks_per_thread = num_Mc_blocks;
    const int64_t m_block_end = Mr_blocks;
{%- else %}
    constexpr int64_t num_Mc_blocks_per_thread = num_Mc_blocks;
    constexpr int64_t m_block_end = Mr_blocks;
{%- endif %}
"""

# Jinja/C++ snippet: per-iteration bounds of the M (row) cache-block loop;
# the (mc_block_id + n_slice_id) rotation staggers threads across M blocks.
GEMM_TEMPLATE_M_LOOP_PARAMS = r"""
    const int64_t my_mc_block_id = (mc_block_id + n_slice_id) % num_Mc_blocks_per_thread;
    const int64_t mc = m_block_start + my_mc_block_id * Mc_blocks;
    const int64_t m_start = mc * Mr;
    const int64_t m_end = std::min(std::min(mc + Mc_blocks, m_block_end) * Mr, M);
    const int64_t m_size = m_end - m_start;
"""

# Jinja/C++ snippet: per-iteration bounds of the N (column) cache-block loop.
GEMM_TEMPLATE_N_LOOP_PARAMS = r"""
    const int64_t n_start = nc * Nr;
    const int64_t n_end = std::min(std::min(nc + Nc_blocks, n_block_end) * Nr, N);
    const int64_t n_size = n_end - n_start;
    // NB: assume we pad N, nc_block_end won't exceed padded N here.
    const int64_t nc_block_end = std::min(nc + Nc_blocks, n_block_end);
"""

# Jinja snippet: emits the common header plus the micro-GEMM function
# definitions for this kernel.
GEMM_TEMPLATE_MICROKERNEL_DEF = r"""
{{template.header().getvalue()}}

{{micro_gemm.codegen_define(kernel)}}
"""

# Jinja snippet: the extern "C" kernel signature.  The input argument set
# depends on the variant: quantized (x_scale/x_zp/w_scale/w_zp), WOQ-int4
# (q_group_size/qscale_and_zeros), or plain (X/W/inp).
GEMM_TEMPLATE_STUB_DEF = r"""
{%- if x_scale is not none %}
{%- set kernel_args = {"X": X, "W": W, "inp": inp, "x_scale": x_scale, "x_zp": x_zp, "w_scale": w_scale, "w_zp": w_zp,} %}
{%- elif is_woq_int4 %}
{%- set kernel_args = {"X": X, "W": W, "q_group_size": q_group_size, "qscale_and_zeros": qscale_and_zeros} %}
{%- else %}
{%- set kernel_args = {"X": X, "W": W, "inp": inp} %}
{%- endif %}

extern "C" {{export_declaration}}
{{kernel.def_kernel(inputs=kernel_args, outputs={"Y": Y}, aliases=aliases)}}
"""
192
+
193
# Main Jinja template for the generated C++ GEMM kernel:
# - emits the kernel stub via template.codegen_gemm_stub_def()
# - emits blocking variables via template.codegen_blocks(...)
# - iterates Mc/Nc/Kc cache blocks, calling micro_gemm.codegen_call per
#   register tile (accum=False on the first K block, accum=True afterwards)
# - with k-slicing enabled, per-thread partial accumulators are stashed in
#   local_buf_ptrs and reduced across the K-thread group after an omp barrier.
# NOTE(review): the body references names (e.g. mc, nc_block_end, tile_acc,
# tile_Y in the k-slicing epilogue) that are bound by the codegen_* helper
# snippets spliced in above — confirm against the renderer before editing.
GEMM_TEMPLATE = r"""
{{ template.codegen_gemm_stub_def() }}
{
{{ kernel.maybe_codegen_profile() }}
{{ template.codegen_blocks(
num_threads, N, K, micro_gemm, is_dynamic_M, kernel, GemmOut, config, L1_cache_size, L2_cache_size, X, W
) }}

{%- if maybe_k_slicing %}
std::unique_ptr<std::unique_ptr<{{DTYPE_TO_CPP[acc_buf_dtype]}}[]>[]> local_buf_ptrs;
if (num_Kt_blocks > 1) {
local_buf_ptrs.reset(new std::unique_ptr<{{DTYPE_TO_CPP[acc_buf_dtype]}}[]>[num_Mc_blocks * num_Nc_blocks * num_Kt_blocks]);
}
{%- endif %}

{%- if num_threads > 1 %}
#pragma omp parallel num_threads({{num_threads}})
{
{{ template.codegen_multi_threads_params()|indent(8, false) }}
{%- else %}
{
{{ template.codegen_single_thread_params(is_dynamic_M)|indent(8, false) }}
{%- endif %}
{{ micro_gemm.codegen_init(kernel) }}
{%- if use_local_acc %}
{%- set acc_buf_name = "local_acc_buf" %}
{{ kernel.define_buffer(acc_buf_name, ["Mc_blocks*Mr", "Nc_blocks*Nr"], acc_buf_dtype) }}
{%- endif %}
for (int64_t mc_block_id = 0; mc_block_id < num_Mc_blocks_per_thread; mc_block_id++) {
{{ template.codegen_m_loop_params()|indent(12, false) }}
for (int64_t nc = n_block_start; nc < n_block_end; nc += Nc_blocks) {
{{ template.codegen_n_loop_params()|indent(16, false) }}
{%- if use_local_acc %}
{%- set acc = kernel.local_buffers[acc_buf_name] %}
{{ kernel.reinit_buffer_if_null(acc_buf_name) }}
{%- else %}
{%- set acc = kernel.slice_nd(GemmOut, [("m_start", "m_end"), ("n_start", "n_end")]) %}
{%- endif %}
for (int64_t kc = k_block_start; kc < k_block_end; kc += Kc_blocks) {
int64_t k_start = kc * Kr;
int64_t k_end = std::min(std::min(kc + Kc_blocks, k_block_end) * Kr, K);
{%- set tile_X = kernel.slice_nd(X, [("m_start", "m_end"), ("k_start", "k_end")]) %}
for (int64_t nci = nc; nci < nc_block_end; nci++) {
{%- set acc_slice = kernel.slice_nd(acc, [("0", "m_end - m_start"), ("(nci - nc)*Nr", "(nci - nc + 1)*Nr")]) %}
{%- if template.should_block_weights and not is_woq_int4 %}
{%- set tile_W_3d = kernel.slice_nd(W, [("nci", "nci + 1"), ("k_start", "k_end"), ()]) %}
{%- set tile_W = kernel.view(tile_W_3d, ["k_end - k_start", micro_gemm.register_blocking.block_n]) %}
{%- else %}
{%- if is_woq_int4 %}
{%- set tile_W = kernel.slice_nd(W, [("nci * Nr", "(nci + 1) * Nr"), ("k_start * Nr / 2", "k_end * Nr / 2")]) %}
{%- set tile_qparam = kernel.slice_nd(
qscale_and_zeros, [("k_start // group_size", "k_end // group_size"), ("nci * Nr", "(nci + 1) * Nr"), ()]) %}
{%- else %}
{%- set tile_W = kernel.slice_nd(W, [("k_start", "k_end"), ("n_start", "n_start + n_size")]) %}
{%- set tile_qparam = None %}
{%- endif %}
{%- endif %}
if (kc == k_block_start) {
{{ micro_gemm.codegen_call(kernel,
tile_X,
tile_W,
acc_slice,
accum=False,
qscale_and_zeros=tile_qparam)|indent(28, false)
}}
} else {
{{ micro_gemm.codegen_call(kernel,
tile_X,
tile_W,
acc_slice,
accum=True,
qscale_and_zeros=tile_qparam)|indent(28, false)
}}
}
}
}
{%- if maybe_k_slicing %}
if (num_Kt_blocks > 1) {
const int64_t mxn_cache_block_id = (mc / Mc_blocks) * num_Nc_blocks + nc;
local_buf_ptrs[mxn_cache_block_id * num_Kt_blocks + k_slice_id].reset(
{{ kernel.release_buffer(acc_buf_name) }});
} else
{%- endif %}
{
{%- set tile_Y = kernel.slice_nd(Y_2d, [("m_start", "m_end"), ("n_start", "n_end")]) %}
{%- set tile_acc = kernel.slice_nd(acc, [("0", "m_end - m_start"), ("0", "n_end - n_start")]) %}
{{ kernel.store_output(
tile_Y, tile_acc, GemmOut, epilogue_nodes, offsets=("m_start", "n_start"), reindexers=reindexers
)|indent(20, false)
}}
}
}
}
{%- if maybe_k_slicing %}
if (num_Kt_blocks > 1) {
#pragma omp barrier
for (int64_t mc = m_block_start; mc < m_block_end; mc += Mc_blocks) {
// We slice M-dim and each thread in the k-slicing group works on a slice
const int64_t m_start_unsliced = mc * Mr;
const int64_t m_end_unsliced = std::min(std::min(mc + Mc_blocks, m_block_end) * Mr, M);
const int64_t m_size_unsliced = m_end_unsliced - m_start_unsliced;
const int64_t m_slice_size = (m_size_unsliced + num_Kt_blocks - 1) / num_Kt_blocks;
const int64_t m_start = std::min(m_start_unsliced + m_slice_size * k_slice_id, m_end_unsliced);
const int64_t m_end = std::min(m_start_unsliced + m_slice_size * (k_slice_id + 1), m_end_unsliced);
const int64_t m_size = m_end - m_start;
const int64_t m_offset = m_start - m_start_unsliced;
for (int64_t nc = n_block_start; nc < n_block_end; nc += Nc_blocks) {
const int64_t n_start = nc * Nr;
const int64_t n_end = std::min(std::min(nc + Nc_blocks, n_block_end) * Nr, N);
const int64_t n_size = n_end - n_start;
const int64_t mxn_cache_block_id = (mc / Mc_blocks) * num_Nc_blocks + nc;
auto {{acc_buf_name}} = local_buf_ptrs[mxn_cache_block_id * num_Kt_blocks].get();
for (int64_t other_slice = 1; other_slice < num_Kt_blocks; other_slice++) {
auto other_acc = local_buf_ptrs[mxn_cache_block_id * num_Kt_blocks + other_slice].get();
for (int64_t m = m_offset; m < m_offset + m_size; m++) {
#pragma omp simd
for (int64_t n = 0; n < n_size; n++) {
{{acc_buf_name}}[m*Nr + n] += other_acc[m*Nr + n];
}
}
}
{%- set tile_acc_m_slice = kernel.slice_nd(tile_acc, [("m_offset", "m_offset + m_end - m_start"), ()]) %}
{{ kernel.store_output(
tile_Y, tile_acc_m_slice, GemmOut, epilogue_nodes, offsets=("m_start", "n_start"), reindexers=reindexers
)|indent(20, false)
}}
}
}
}
{%- endif %}
{{ micro_gemm.codegen_finalize(kernel) }}
}
}
"""
327
+
328
# Jinja template for GEMMs with small M: parallelizes only over Nr blocks
# ("#pragma omp for nowait", one M x Nr output panel per thread) and keeps the
# K loop sequential inside each thread. The first and last K iterations get
# special accum/prefetch flags in micro_gemm.codegen_call.
SMALL_M_GEMM_TEMPLATE = r"""
{{ template.codegen_gemm_stub_def() }}
{
{{ kernel.maybe_codegen_profile() }}
{{ template.codegen_blocks(
num_threads, N, K, micro_gemm, is_dynamic_M, kernel, GemmOut, config, L1_cache_size, L2_cache_size, X, W
) }}
# pragma omp parallel
{
#pragma omp for nowait
for (int64_t nr_block_id = 0; nr_block_id < Nr_blocks; nr_block_id++) {
// Handle one output M * Nr block in each thread
int64_t n_start = nr_block_id * Nr;
int64_t n_end = (nr_block_id + 1) * Nr;
{%- if use_local_acc %}
{%- set acc_buf_name = "local_acc_buf" %}
{{ kernel.define_stack_allocated_buffer(acc_buf_name, ["M", "Nr"], acc_buf_dtype) }}
{%- set acc = kernel.local_buffers[acc_buf_name] %}
{%- else %}
{%- set acc = kernel.slice_nd(GemmOut, [(0, "M"), ("n_start", "n_end")]) %}
{%- endif %}
for (int64_t kr_block_id = 0; kr_block_id < Kr_blocks; kr_block_id++) {
// this loop is not parallelized
int64_t k_start = kr_block_id * Kr;
int64_t k_end = std::min((kr_block_id + 1) * Kr, K);
{%- set tile_X = kernel.slice_nd(X, [(0, "M"), ("k_start", "k_end")]) %}
{%- set tile_W_3d = kernel.slice_nd(W, [("nr_block_id", "nr_block_id + 1"), ("k_start", "k_end"), ()]) %}
{%- set tile_W = kernel.view(tile_W_3d, ["k_end - k_start", micro_gemm.register_blocking.block_n]) %}
if C10_UNLIKELY(kr_block_id == 0) {
{{ micro_gemm.codegen_call(kernel, tile_X, tile_W, acc, accum=False, prefetch=True)|indent(20, false) }}
} else if C10_UNLIKELY(k_end == K) {
{{ micro_gemm.codegen_call(kernel, tile_X, tile_W, acc, accum=True, prefetch=False)|indent(20, false) }}
} else {
{{ micro_gemm.codegen_call(kernel, tile_X, tile_W, acc, accum=True, prefetch=True)|indent(20, false) }}
}
}
{%- set tile_Y = kernel.slice_nd(Y_2d, [("0", "M"), ("n_start", "n_end")]) %}
{%- set tile_acc = kernel.slice_nd(acc, [("0", "M"), ("0", "n_end - n_start")]) %}
{{ kernel.store_output(
tile_Y, tile_acc, GemmOut, epilogue_nodes, offsets=("0", "n_start"), reindexers=reindexers
)|indent(20, false) }}
}
}
}
"""
373
+
374
+
375
+ def _is_int8_gemm(inputs):
376
+ return (
377
+ isinstance(inputs[0], ir.IRNode)
378
+ and inputs[0].get_dtype() in [torch.uint8, torch.int8]
379
+ ) or (
380
+ isinstance(inputs[0], torch.Tensor)
381
+ and inputs[0].dtype in [torch.uint8, torch.int8]
382
+ )
383
+
384
+
385
def get_padded_n(n, block_n):
    """Round ``n`` up to the next multiple of ``block_n``."""
    # -(-n // block_n) is ceil(n / block_n) in integer arithmetic.
    full_blocks = -(-n // block_n)
    return full_blocks * block_n
387
+
388
+
389
_T = TypeVar("_T", ir.IRNode, torch.Tensor)


def transpose_w(W: _T, trans_w: bool) -> _T:
    """
    Transpose W based on the trans_w flag.

    No-op when ``trans_w`` is False; otherwise swaps the two dims of an
    IR node (via the permute lowering) or a plain tensor.
    """
    if not trans_w:
        return W
    if isinstance(W, ir.IRNode):
        # Lowerings operate on TensorBox, so wrap the node first if needed.
        boxed = W if isinstance(W, ir.TensorBox) else ir.TensorBox(W)
        return L.permute(boxed, [1, 0])
    assert isinstance(W, torch.Tensor)
    return W.transpose(0, 1)
406
+
407
+
408
def expand_bias(B: Optional[_T], X: _T) -> Optional[_T]:
    """
    Expand Bias to the same size of X.

    Broadcasts B to shape (X.rows, B.cols); returns None unchanged.
    """
    if B is None:
        return None
    if isinstance(B, ir.IRNode):
        boxed = B if isinstance(B, ir.TensorBox) else ir.TensorBox(B)
        assert hasattr(X, "get_size")
        return L.expand(boxed, (X.get_size()[0], boxed.get_size()[-1]))
    assert isinstance(B, torch.Tensor)
    assert isinstance(X, torch.Tensor)
    return B.expand(X.shape[0], B.shape[-1])
423
+
424
+
425
def prune_tensors(input_nodes: list[ir.IRNode], new_input_nodes: list[ir.IRNode]):
    """
    Prune unused tensors from `V.graph` since the GEMM Template use new packed weight.

    Scans `input_nodes` for constant buffers (weight/bias) that were replaced by
    the packed versions in `new_input_nodes` and, when a constant has exactly one
    remaining `get_attr` user, deletes it from both `V.graph.constants` and the
    FX module. Mutates `V.graph` in place; returns nothing.
    """

    def share_storage(base_tensor: torch.Tensor, comp_tensor: torch.Tensor):
        # Two tensors count as "the same" if they alias the same storage,
        # for either dense or mkldnn tensors (never across the two kinds).
        return base_tensor.is_mkldnn == comp_tensor.is_mkldnn and (
            is_same_tensor(base_tensor, comp_tensor)
            or is_same_mkldnn_tensor(base_tensor, comp_tensor)
        )

    def get_candidates(input_nodes, new_input_nodes):
        # Only Constant Buffer like weight and bias might be changed in GEMM Template.
        # The Inductor IR Node may changed, but still share the storage. For example:
        # bias in bfloat16 case which only do the expand
        return [
            node
            for node in input_nodes
            if (
                node not in new_input_nodes
                and isinstance(node, (ir.TensorBox, ir.StorageBox))
                and node.get_name() in V.graph.constants
                and not any(
                    (
                        isinstance(new_node, (ir.TensorBox, ir.StorageBox))
                        and new_node.get_name() in V.graph.constants
                        and share_storage(
                            V.graph.constants[node.get_name()],
                            V.graph.constants[new_node.get_name()],
                        )
                    )
                    for new_node in new_input_nodes
                )
            )
        ]

    for candidate_node in get_candidates(input_nodes, new_input_nodes):
        # By using the new packed weight for the GEMM template, we can prune the
        # old weight if it has no other users. This saves memory but makes the FX graph
        # non-retraceable. To support retracing, we can add a repack node to the
        # FX graph. For example:
        # mkldnn._linear_pointwise <- repack_linear_wgt <- packed_wgt_for_template
        # First pass: count how many get_attr nodes alias this constant's storage.
        candidate_tensor_users = 0
        candidate_tensor = V.graph.constants[candidate_node.get_name()]
        for node in reversed(V.graph.graph.nodes):
            # Case may happen when the candidate tensor is used by more than 1 get_attr node
            # https://github.com/pytorch/pytorch/issues/134998
            if node.op == "get_attr" and hasattr(
                V.graph.module, node.target
            ):  # candidate tensor might already be deleted
                comp_tensor = getattr(V.graph.module, node.target)
                if isinstance(comp_tensor, torch.Tensor) and share_storage(
                    candidate_tensor, comp_tensor
                ):
                    candidate_tensor_users += 1

        # Second pass: delete only when it is safe (single get_attr user of the
        # storage AND that get_attr has a single FX user).
        for node in reversed(V.graph.graph.nodes):
            # The get_attr node has only 1 user fx node
            # The candidate tensor has been used by only 1 get_attr node
            if (
                node.op == "get_attr"
                and node.target == candidate_node.get_name()
                and len(node.users) == 1
                and candidate_tensor_users == 1
            ):
                del V.graph.constants[node.target]
                delattr(V.graph.module, node.target)
                delattr(V.graph.graph.owning_module, node.target)
                counters["inductor"]["select_algorithm_weight_prune"] += 1
494
+
495
+
496
def gen_2d_view_of_epilogue_buf(
    Y: ir.Buffer,
    template_buffer: ir.Buffer,
    epilogue_nodes: list[ir.IRNode],
    reindexers: list[Optional[Callable[[list[Any]], list[Any]]]],
    default_reindexers: list[Optional[Callable[[list[Any]], list[Any]]]],
) -> tuple[
    Union[ir.Buffer, ir.ReinterpretView],
    list[Optional[Callable[[list[Any]], list[Any]]]],
]:
    """
    The dimension and the indexing could be different between the GEMM output, i.e. `template_buffer`, which is
    2D with MxN) and the output from the template after epilogues, i.e. `Y`. In the GEMM template code,
    we are not aware of the dimension and the indexing of the epilogues and always work on 2D tiles according to
    the indexing of the GEMM output.
    In this function, we return a 2D buffer (`Y_2d`) according to GEMM output (reinterpreted from `Y` if needed) and
    build a reindexer that converts the indexing of `Y` into `Y_2d`.

    `reindexers` is extended in place (one entry per epilogue node) and also
    returned alongside `Y_2d`.
    """
    Y_2d: Union[ir.Buffer, ir.ReinterpretView] = Y
    if (
        Y.get_size() == template_buffer.get_size()
        and Y.get_stride() == template_buffer.get_stride()
    ):
        # Same layout as the GEMM output: use Y directly with the default reindexers.
        reindexers.extend(default_reindexers)
        Y_2d = Y
    else:

        def get_reindexer(epilogue_node, default_reindexer=None):
            # From template_buffer to epilogue_node_ordered (ordered by stride decreasingly, in dense format), for example:
            #   template_buffer:
            #       size (324, 512), stride (512, 1)
            #   epilogue_node_ordered (ordered by stride decreasingly, in dense format):
            #       size (1, 18, 18, 512), stride (165888, 9216, 512, 1)
            stride_order = list(
                ir.get_stride_order(
                    V.graph.sizevars.size_hints(epilogue_node.get_stride())
                )
            )
            fill_order = ir.stride_order2fill_order(stride_order)
            reversed_fill_order = list(reversed(fill_order))
            size_with_stride_ordered_decreasingly = [
                epilogue_node.get_size()[i] for i in reversed_fill_order
            ]
            reshape_reindex = ir.View.dynamic_reshape_indexer(
                size_with_stride_ordered_decreasingly,
                template_buffer.get_size(),
            )
            if default_reindexer:
                reshape_reindex = ir.fuse_reindexing(reshape_reindex, default_reindexer)

            # From epilogue_node_ordered (ordered by stride decreasingly, in dense format) to epilogue_node, for example:
            #   epilogue_node_ordered (ordered by stride decreasingly, in dense format):
            #       size (1, 18, 18, 512), stride (165888, 9216, 512, 1)
            #   epilogue_node:
            #       size (1, 18, 18, 512), stride (165888, 1, 9216, 512)
            from_stride_ordered_decreasingly_to_epilogue_node_order = [
                (len(stride_order) - 1) - stride_order[i]
                for i in range(len(stride_order))
            ]
            stride_reindex = ir.same_reorder(
                from_stride_ordered_decreasingly_to_epilogue_node_order
            )

            reindexer = ir.fuse_reindexing(stride_reindex, reshape_reindex)  # type: ignore[var-annotated]
            return reindexer

        # NOTE(review): the None check happens only on this branch; the `if`
        # branch above would raise if default_reindexers were None — presumably
        # callers pass a real list whenever layouts match. TODO confirm.
        if default_reindexers is None:
            default_reindexers = [None] * len(epilogue_nodes)
        new_reindexers = [
            get_reindexer(epilogue_node, default_reindexer)
            for epilogue_node, default_reindexer in zip(
                epilogue_nodes, default_reindexers
            )
        ]
        reindexers.extend(new_reindexers)
        if isinstance(Y, ir.BaseView):
            storage = ir.StorageBox(Y.unwrap_view())
        else:
            assert isinstance(Y, ir.Buffer)
            storage = ir.StorageBox(Y)
        # Reinterpret Y's storage with the 2D GEMM-output layout.
        Y_2d = ir.ReinterpretView(data=storage, layout=template_buffer.get_layout())
    return Y_2d, reindexers
578
+
579
+
580
+ class CppGemmTemplate(CppTemplate):
581
+ """
582
+ GEMM Template for Inductor CPP Backend.
583
+ """
584
+
585
    def __init__(
        self,
        input_nodes,
        layout: ir.Layout,
        num_threads: int,
        register_blocking: GemmBlocking,
        beta=1,
        alpha=1,
        has_bias=False,
        epilogue_creator: Optional[Callable[[ir.Buffer], ir.Pointwise]] = None,
        should_block_weights: bool = True,
        name="packed_gemm",
    ) -> None:
        """
        Initialize the GEMM template.

        ``input_nodes`` are expected in [X, W, inp?, ...] order (see
        ``add_choices``); M and N come from the output layout, K from X's
        last dimension.
        """
        assert layout.dtype in [torch.float, torch.bfloat16, torch.half, torch.uint8]
        super().__init__(
            name,
            input_nodes,
            layout,
            num_threads,
            epilogue_creator=epilogue_creator,
        )
        self.beta = beta
        self.alpha = alpha
        self.has_bias = has_bias
        self.register_blocking = register_blocking
        # M, N from the output layout; K from the activation's last dim.
        m, n = layout.size[-2:]
        k = input_nodes[0].get_size()[-1]
        self.m, self.n, self.k = m, n, k
        # N padded up to a multiple of the register block for weight packing.
        self.padded_n = get_padded_n(n, self.register_blocking.block_n)
        # Dynamic M means thread/cache blocking must be deferred to runtime.
        self.is_dynamic_M = has_free_symbols((m,))
        self.should_block_weights = should_block_weights
        # Memoized blocking computations, keyed by thread count.
        self.thread_blocking = self.make_thread_blocking_cache()
        self.cache_blocking = self.make_cache_blocking_cache()
618
+
619
+ def make_thread_blocking_cache(self):
620
+ cache = lru_cache()(self._thread_blocking)
621
+
622
+ def thread_blocking(num_threads: int) -> GemmBlocking:
623
+ return cache(num_threads)
624
+
625
+ return thread_blocking
626
+
627
    def _thread_blocking(self, num_threads: int) -> GemmBlocking:
        """
        NOTE [Thread blocking in Cpp GEMM]
        We use simple heuristics to decide the thread blocking:
        1. Make sure all threads are occupied as much as possible.
        2. For (m, n) blocks, favor more square-sized thread blocks for better data reuse.
        3. If (m, n) blocks cannot occupy all the threads, we consider k-slicing.
        TODO(jgong5): allow tuning various blocking options

        Returns per-thread block counts (in units of register blocks) as a
        ``GemmBlocking(m, n, k)``. Requires static M (asserted below).
        """

        def get_factors(number):
            # All factor pairs of `number`, interleaved as (number//i, i) with i
            # descending from sqrt(number) — so larger cofactors come first.
            factors = []
            for i in range(int(number**0.5), 0, -1):
                if number % i == 0:
                    factors.append(number // i)
                    factors.append(i)
            return factors

        def get_blocking(m_factor, n_factor, k_factor, m_blocks, n_blocks, k_blocks):
            # Split each dimension's register-block count evenly across its factor.
            thread_block_k = math.ceil(k_blocks / k_factor)
            thread_block_n = math.ceil(n_blocks / n_factor)
            thread_block_m = math.ceil(m_blocks / m_factor)
            return GemmBlocking(thread_block_m, thread_block_n, thread_block_k)

        assert not self.is_dynamic_M, (
            "Unable to determine thread blocking for dynamic M."
        )
        register_blocking = self.register_blocking
        m_blocks = math.ceil(self.m / register_blocking.block_m)
        n_blocks = math.ceil(self.n / register_blocking.block_n)
        k_blocks = math.ceil(self.k / register_blocking.block_k)
        factors = get_factors(num_threads)
        assert len(factors) > 0

        # User override: "m,n,k" thread factors from config bypass the search.
        if config.cpp.gemm_thread_factors is not None:
            factors = [int(i) for i in config.cpp.gemm_thread_factors.split(",")]
            assert len(factors) == 3
            assert math.prod(factors) == self.num_threads
            return get_blocking(
                factors[0], factors[1], factors[2], m_blocks, n_blocks, k_blocks
            )

        # we favor square-sized thread blocks for good data reuse
        def get_better_blocking(blocking, best_blocking):
            # Prefer larger block_k first (less k-slicing), then the smaller
            # m+n perimeter (more square-shaped block).
            if best_blocking is None:
                best_blocking = blocking
            else:
                block_m_size = blocking.block_m * register_blocking.block_m
                block_n_size = blocking.block_n * register_blocking.block_n
                best_block_m_size = best_blocking.block_m * register_blocking.block_m
                best_block_n_size = best_blocking.block_n * register_blocking.block_n
                if blocking.block_k > best_blocking.block_k:
                    best_blocking = blocking
                elif (
                    blocking.block_k == best_blocking.block_k
                    and block_m_size + block_n_size
                    < best_block_m_size + best_block_n_size
                ):
                    best_blocking = blocking
            return best_blocking

        best_blocking = None
        # check if we can have a thread-blocking to occupy all threads without k-slicing
        for n_factor in factors:
            m_factor = num_threads // n_factor
            if n_blocks >= n_factor and m_blocks >= m_factor:
                blocking = get_blocking(
                    m_factor, n_factor, 1, m_blocks, n_blocks, k_blocks
                )
                best_blocking = get_better_blocking(blocking, best_blocking)

        # Fallback 1: allow k-slicing (bounded by gemm_max_k_slices if set).
        if best_blocking is None:
            for k_factor in factors:
                if k_blocks >= k_factor and (
                    config.cpp.gemm_max_k_slices == 0
                    or k_factor <= config.cpp.gemm_max_k_slices
                ):
                    n_factors = get_factors(num_threads // k_factor)
                    for n_factor in n_factors:
                        m_factor = (num_threads // k_factor) // n_factor
                        if n_blocks >= n_factor and m_blocks >= m_factor:
                            blocking = get_blocking(
                                m_factor,
                                n_factor,
                                k_factor,
                                m_blocks,
                                n_blocks,
                                k_blocks,
                            )
                            best_blocking = get_better_blocking(blocking, best_blocking)

        # Fallback 2: accept partial occupancy (only one of m/n covers its factor).
        if best_blocking is None:
            for n_factor in factors:
                m_factor = num_threads // n_factor
                if n_blocks >= n_factor or m_blocks >= m_factor:
                    blocking = get_blocking(
                        m_factor, n_factor, 1, m_blocks, n_blocks, k_blocks
                    )
                    best_blocking = get_better_blocking(blocking, best_blocking)

        assert best_blocking is not None
        return best_blocking
729
+
730
+ def make_cache_blocking_cache(self):
731
+ cache = lru_cache()(self._cache_blocking)
732
+
733
+ def cache_blocking(num_threads: int) -> GemmBlocking:
734
+ return cache(num_threads)
735
+
736
+ return cache_blocking
737
+
738
    def _cache_blocking(self, num_threads: int) -> GemmBlocking:
        """
        Compute cache blocking (Mc, Nc, Kc) in units of register blocks,
        fitting B panels into L1 and A panels into L2 (see the NOTE inside).
        Requires static M (asserted below).
        """

        def get_cache_blocking(register_blocking, thread_blocking):
            Mr = register_blocking.block_m
            Nr = register_blocking.block_n
            Kr = register_blocking.block_k

            Mt_blocks = thread_blocking.block_m
            Nt_blocks = thread_blocking.block_n
            Kt_blocks = thread_blocking.block_k

            # User override from config, clamped to the thread blocking.
            if config.cpp.gemm_cache_blocking is not None:
                blockings = [int(i) for i in config.cpp.gemm_cache_blocking.split(",")]
                assert len(blockings) == 3
                Mc_blocks, Nc_blocks, Kc_blocks = blockings
                return (
                    min(Mc_blocks, Mt_blocks),
                    min(Nc_blocks, Nt_blocks),
                    min(Kc_blocks, Kt_blocks),
                )

            # The ratios below are empirically determined to decide
            # the effective sizes of L1 and L2.
            # TODO: tune the factor here
            L1_limit_factor = 0.8
            L2_limit_factor = 0.5

            L1_cache_size = (
                torch._C._cpu._L1d_cache_size()
            )  # per core cache size in Bytes
            assert L1_cache_size > 0, (
                f"Expect L1_cache_size > 0 but got {L1_cache_size}"
            )
            L1 = L1_cache_size * L1_limit_factor

            L2_cache_size = (
                torch._C._cpu._L2_cache_size()
            )  # per core cache size in Bytes
            assert L2_cache_size > 0, (
                f"Expect L2_cache_size > 0 but got {L2_cache_size}"
            )
            L2 = L2_cache_size * L2_limit_factor

            def get_num_byte(dtype):
                # Element size in bytes for the given dtype.
                return torch.tensor([], dtype=dtype).element_size()

            dtype_A = self.input_nodes[0].get_dtype()
            dtype_B = self.input_nodes[1].get_dtype()
            num_byte_A = get_num_byte(dtype_A)
            num_byte_B = get_num_byte(dtype_B)
            if dtype_A is torch.bfloat16 and dtype_B is torch.int8 and Kr != 1:
                # We will cache dequantized weights (BF16) in L1D for AMX micro-kernel.
                # In this case, the choice of the micro-kernel being used can't be decoupled from
                # the cache blocking.
                # TODO: Decouple the choice of micro-kernel from cache blocking
                num_byte_B *= num_byte_A

            # NOTE [CPP GEMM Cache Blocking Algorithm]
            # Our overall strategy is to
            # 1) Make cache blocks of B L1-reside and reused by multiple rows of A, i.e. Mc.
            #    Here, B is Kc x Nr where Nr is a single register block. We use L1 size to
            #    decide Kc. We want to make Mc large enough to better reuse B.
            # 2) Make cache blocks of A L2-reside, which would limit Mc. We want to reuse A
            #    along N, where we have two sub-strategies (see notes below) to decide Mc and Nc.

            # Step 1: Decide Kc assuming B block is L1-reside.
            size_cache_B = Kr * Kt_blocks * Nr * num_byte_B

            Kc_blocks = Kt_blocks
            if size_cache_B > L1:
                Kc_blocks = math.floor(L1 / (Kr * Nr * num_byte_B))

            if (
                config.cpp.use_small_dequant_buffer
                and dtype_A is torch.bfloat16
                and dtype_B is torch.uint8
                and Mt_blocks == 1
            ):
                # Make a small dequant_B buffer for woq int4 [q_group_size, Nr]
                # Since when Mt_blocks == 1, L1-reside B block can't be reused by A.
                if Kc_blocks * Kr >= self.q_group_size():
                    Kc_blocks = self.q_group_size() // Kr

            # Step 2: Decide Mc assuming A block is L2-reside.
            min_Mc_ratio = 2  # TODO(jgong5): something to tune?
            min_Mc_blocks = math.ceil(min_Mc_ratio * Mr / Nr)
            assert min_Mc_blocks >= 1
            Kt_bytes = Kt_blocks * Kr * num_byte_A
            if min_Mc_blocks * Mr * Kt_bytes < L2:
                # Strategy 1: A (Mc x Kt) resides in L2 and reused by all Nt
                # when Nc_blocks is kept 1. Mc should be large enough (>= min_Mc_blocks)
                # to reuse B (Kc x Nr) in L1. This makes C (Mc x Nr) small enough to reside
                # in L1.
                Mc_blocks = min(Mt_blocks, math.floor(L2 / (Mr * Kt_bytes)))
                Nc_blocks = 1
            else:
                # Strategy 2: Kt is too large to hold A (Mc x Kt) in L2, we reuse
                # A (Mc x Kc) in L2 by B (Kc x Nc). C (Mc x Nc) resides in L2.
                Mc_blocks = Mt_blocks
                Nc_blocks = min(math.ceil(Mc_blocks * Mr / Nr), Nt_blocks)
                Nc_bytes = Nc_blocks * Nr * 4  # assume C or acc is float32/int32
                Kc_bytes = Kc_blocks * Kr * num_byte_A
                if Mc_blocks * Mr * (Kc_bytes + Nc_bytes) > L2:
                    # The following is the solution for 4*Mc*Nc + Mc*Kc_bytes = L2,
                    # assuming Mc == Nc for good data reuse.
                    M_max = (math.sqrt(Kc_bytes * Kc_bytes + 16 * L2) - Kc_bytes) / 8
                    if M_max < Mc_blocks * Mr:
                        Mc_blocks = math.floor(M_max / Mr)
                        Nc_blocks = min(math.ceil(Mc_blocks * Mr / Nr), Nt_blocks)

            return Mc_blocks, Nc_blocks, Kc_blocks

        assert not self.is_dynamic_M, (
            "Unable to determine cache blocking for dynamic M."
        )
        register_blocking = self.register_blocking
        thread_blocking = self.thread_blocking(num_threads)

        return GemmBlocking(*get_cache_blocking(register_blocking, thread_blocking))
856
+
857
    def log_blockings(self):
        """Log register/cache/thread blockings and thread occupancy (debug only)."""
        log.debug(f"Register blocking: {self.register_blocking}")  # noqa: G004
        if self.is_dynamic_M:
            # thread and cache blockings are determined at runtime for dynamic shapes
            return
        log.debug(
            f"Cache blocking: {self.cache_blocking(self.num_threads)}"  # noqa: G004
        )
        thread_blocking = self.thread_blocking(self.num_threads)
        log.debug(f"Thread blocking: {thread_blocking}")  # noqa: G004

        def get_occupancy():
            # How many thread blocks each dimension is split into.
            m_blocks = math.ceil(self.m / self.register_blocking.block_m)
            n_blocks = math.ceil(self.n / self.register_blocking.block_n)
            k_blocks = math.ceil(self.k / self.register_blocking.block_k)
            m = math.ceil(m_blocks / thread_blocking.block_m)
            n = math.ceil(n_blocks / thread_blocking.block_n)
            k = math.ceil(k_blocks / thread_blocking.block_k)
            return (m, n, k)

        log.debug(
            f"Number of threads: {self.num_threads}, occupancy: {get_occupancy()}"  # noqa: G004
        )
880
+
881
+ def maybe_k_slicing(self):
882
+ if self.num_threads == 1:
883
+ return False
884
+ if self.is_dynamic_M:
885
+ # TODO(jgong5): perhaps use size hint to decide?
886
+ return True
887
+ register_blocking = self.register_blocking
888
+ k_blocks = math.ceil(self.k / register_blocking.block_k)
889
+ thread_blocking = self.thread_blocking(self.num_threads)
890
+ return k_blocks > thread_blocking.block_k
891
+
892
    @classmethod
    def add_choices(
        cls,
        choices,
        layout,
        input_nodes,
        beta=1,
        alpha=1,
        has_bias=False,
        trans_w=False,
        input_indices=None,
        epilogue_creator: Optional[Callable[[ir.Buffer], ir.Pointwise]] = None,
        act_mapping: Optional[dict[int, ir.IRNode]] = None,
    ):
        """
        Add choices for the GEMM template.

        Normalizes the inputs (reorders to [X, W, inp, ...], densifies/transposes
        W, expands bias), picks a micro-kernel, and registers a
        DataProcessorTemplateWrapper in `choices` whose pre/post-processors pack
        the weight and prune the old constant.
        NOTE(review): `act_mapping` is not used in this body — presumably
        consumed by a subclass override; confirm before removing.
        """
        # Fast path to save the epilogue calculation when x_scale/x_zp/w_scale are constant
        use_int8_fast_compensation_path = _is_int8_gemm(input_nodes) and all(
            (
                isinstance(input_nodes[idx], ir.TensorBox)
                and isinstance(input_nodes[idx].data.data, ir.ConstantBuffer)
            )
            for idx in [1, 2, 4]
        )

        if input_indices is None:
            input_indices = list(range(len(input_nodes)))
        # X@X.T-style cases pass the same node for both operands.
        only_one_input = (
            input_nodes[0] == input_nodes[1] if len(input_nodes) > 1 else False
        )

        def reorder_and_filter(inputs, layout_or_out):
            # Canonicalize input order for the template: [X, W, inp?, extras...].
            if has_bias:
                assert len(input_indices) >= 3
                # Assume the input order is [inp, x, w] and we reorder it to [x, w, inp]
                inp_idx = input_indices[0]
                x_idx = input_indices[1]
                w_idx = input_indices[2]
                return [
                    inputs[x_idx],
                    inputs[w_idx],
                    inputs[inp_idx],
                    *[inputs[idx] for idx in input_indices[3:]],
                ], layout_or_out
            elif len(inputs) >= len(input_indices):
                assert len(input_indices) >= 2
                return [inputs[idx] for idx in input_indices], layout_or_out
            else:
                # For when input is used for x and w, i.e. X@X.T or similar
                # Assumes the first input is the only input
                assert len(inputs) == 1
                return [inputs[0]] * len(input_indices), layout_or_out

        new_inputs, new_layout = reorder_and_filter(input_nodes, layout)
        is_mkldnn_wgt = (
            new_inputs[1].get_name() in V.graph.constants
            and V.graph.constants[new_inputs[1].get_name()].is_mkldnn
        )
        if is_mkldnn_wgt:
            # It shouldn't happen as viewing an mkldnn tensor, we can extend the
            # implementation if it does.
            assert not isinstance(new_inputs[1], ir.BaseView)
        # Note that the layout of MKLDNN Tensor is with the wrong stride
        view_size = new_inputs[1].layout.size
        view_stride = new_inputs[1].layout.stride
        view_offset = new_inputs[1].layout.offset

        def maybe_to_dense(inputs, layout_or_out):
            # Convert an mkldnn weight tensor to dense; leave IR nodes alone.
            new_inputs = list(inputs)
            if isinstance(inputs[1], torch.Tensor):
                W = inputs[1]
                new_inputs[1] = W.to_dense() if W.is_mkldnn else W
            return new_inputs, layout_or_out

        def normalize_shapes(inputs, layout_or_out):
            # Restore W's logical view, then apply transpose/bias expansion.
            new_inputs = list(inputs)
            if not is_mkldnn_wgt and isinstance(new_inputs[1], torch.Tensor):
                if has_free_symbols(view_size):
                    # If batch size B is dynamic, we need to set the batch size and possibly stride
                    assert not has_free_symbols(view_size[1:])
                    view_size[:] = V.graph.sizevars.size_hints(view_size)
                    view_stride[:] = V.graph.sizevars.size_hints(view_stride)
                # With the assumptation that W is the storage of unwrap view
                # thus view it back here
                new_inputs[1] = new_inputs[1].as_strided(
                    view_size, view_stride, view_offset
                )

            if not trans_w:
                return new_inputs, layout_or_out
            X = new_inputs[0]
            W = new_inputs[1]
            B = new_inputs[2] if has_bias else None
            W = transpose_w(W, trans_w)
            B = expand_bias(B, X)  # type:ignore[arg-type]
            new_inputs[1] = W
            if B is not None:
                new_inputs[2] = B
            return new_inputs, layout_or_out

        # TODO(jgong5): decide proper number of threads per problem size
        num_threads = parallel_num_threads()
        new_inputs, _ = normalize_shapes(*maybe_to_dense(new_inputs, new_layout))
        m, n, k, *_ = mm_args(
            new_inputs[0],
            new_inputs[1],
            mat2_transposed=cls.is_woq_int4(),
            use_4x2_dim=cls.is_woq_int4(),
        )
        output_dtype, compute_dtype = get_gemm_template_output_and_compute_dtype(
            new_inputs[0].get_dtype()
        )
        micro_gemm = create_micro_gemm(
            "micro_gemm",
            m,
            n,
            k,
            input_dtype=new_inputs[0].get_dtype(),
            input2_dtype=new_inputs[1].get_dtype(),
            output_dtype=output_dtype,
            compute_dtype=compute_dtype,
            alpha=alpha,
            num_threads=num_threads,
            use_ref=not cls.is_woq_int4(),
            q_group_size=cls.q_group_size(),
        )
        assert micro_gemm is not None
        pre_block_weights = cls.check_if_block_weight(new_inputs[1], micro_gemm)
        micro_gemm.use_local_vnni_blocking(not pre_block_weights)

        def preprocessor(inputs, layout):
            # Runs before codegen: canonicalize inputs and pack the weight.
            new_inputs, new_layout = normalize_shapes(
                *maybe_to_dense(*reorder_and_filter(inputs, layout))
            )
            if only_one_input and isinstance(new_inputs[0], torch.Tensor):
                return new_inputs[1:], new_layout
            return cls.prep_weight(
                new_inputs,
                new_layout,
                micro_gemm,
                pre_block_weights,
                use_int8_fast_compensation_path,
            )

        def postprocessor(output):
            # Runs after the template is chosen: swap the constant weight for
            # its packed version on the template buffer and prune the old one.
            if isinstance(output, ir.TensorBox):
                # prepack the weight as input to the template buffer
                template_buffer = ir.InputsKernel.unwrap_storage_for_input(output)
                assert isinstance(template_buffer, ir.CppTemplateBuffer)
                new_input_nodes, _ = reorder_and_filter(input_nodes, layout)

                W_node = new_input_nodes[1]
                if W_node.get_name() not in V.graph.constants:
                    return output
                W = V.graph.constants[W_node.get_name()]
                new_input_nodes[1] = W
                new_input_nodes, new_layout = normalize_shapes(
                    *maybe_to_dense(new_input_nodes, layout)
                )
                new_input_nodes, _ = cls.prep_weight(
                    new_input_nodes,
                    new_layout,
                    micro_gemm,
                    pre_block_weights,
                    use_int8_fast_compensation_path,
                    skip_int8_compensation=True,
                )
                W_packed = new_input_nodes[1]
                W_packed_constant = V.graph.add_tensor_constant(W_packed)
                new_input_nodes[1] = W_packed_constant

                # Prune unused tensors
                prune_tensors(input_nodes, new_input_nodes)

                template_buffer.inputs[1] = ir.InputsKernel.unwrap_storage_for_input(
                    W_packed_constant
                )
            return output

        template = DataProcessorTemplateWrapper(
            cls,
            preprocessor,
            postprocessor,
            input_nodes=input_nodes,
            layout=layout,
            num_threads=num_threads,
            register_blocking=micro_gemm.register_blocking,
            beta=beta,
            alpha=alpha,
            has_bias=has_bias,
            epilogue_creator=epilogue_creator,
            should_block_weights=pre_block_weights,
            name=micro_gemm.__class__.__name__,
        )
        template.maybe_append_choice(choices)
        return template
1089
+
1090
@staticmethod
def get_padded_size(n, block_n, k, should_block_weight):
    """Compute the blocked 3D weight shape and the padded n dimension.

    For the plain GEMM template every weight tensor is blocked and padded,
    so ``should_block_weight`` is accepted but unused here; subclasses
    (e.g. the BMM template) override this and consult it.
    """
    rounded_n = get_padded_n(n, block_n)
    blocked_shape = [rounded_n // block_n, k, block_n]
    return blocked_shape, rounded_n
1096
+
1097
@classmethod
def prep_weight(
    cls,
    inputs,
    layout: ir.Layout,
    micro_gemm: CppMicroGemm,
    should_block_weight: bool,
    use_int8_fast_compensation_path: bool = False,
    skip_int8_compensation: bool = False,
):
    """
    NOTE Weight prep consists of 2 separate steps:
    1. Blocking the weight tensor into a 3D shape: [n//block_n, k, block_n]
       This is always done if the weight tensor is constant, i.e. for all GEMM and some BMM.
       For BMM, we also block non-contiguous weight tensors, since they would be reshaped anyway.
       This assumes that blocked, contiguous weights will be more efficient for the GEMM kernel,
       and is worth the overhead of reshape and blocking.

       This blocking includes additional padding, when n is not a multiple of block_n.
       This padding allows a more efficient microkernel implementation. For BMM, this is only done
       if reshape would happen anyway, i.e. if the weight tensor is constant, is not contiguous,
       or is using AMX VNNI layout.
    2. Packing the weight tensor into a VNNI-friendly shape. For constant input,
       this is done at the same time as the weight blocking.

    At compile time, the constant weight tensors are blocked and packed. For non-constant tensors (e.g. BMM)
    which will be blocked (non-contiguous or VNNI-layout tensors), the weight tensor is blocked and packed at runtime.

    CppBmmTemplate overrides the methods get_padded_size, and block_weight in order to accommodate
    an additional dimension for the batch size and to determine if the weight tensor should be blocked.

    Returns a (new_inputs, layout) pair; for int8 GEMM the compensation
    tensors are appended to the returned input list.
    """
    W = inputs[1]
    new_inputs = list(inputs)
    # W may be an IR node (graph input/constant) or an eager torch.Tensor;
    # size queries differ accordingly.
    if cls.is_woq_int4():
        # WOQ INT4 weights are pre-packed 2D: [n, k-packed]
        assert (
            len(W.get_size()) == 2
            if isinstance(W, ir.IRNode)
            else len(W.shape) == 2
        )
        n, k = W.get_size() if isinstance(W, ir.IRNode) else W.shape
    else:
        k, n = W.get_size()[-2:] if isinstance(W, ir.IRNode) else W.shape[-2:]
    _, block_n, _ = micro_gemm.register_blocking
    new_size, padded_n = cls.get_padded_size(n, block_n, k, should_block_weight)
    padding = padded_n - n

    if should_block_weight and not cls.is_woq_int4():
        # Step 1 (blocking) then step 2 (VNNI packing); see docstring NOTE.
        blocked_w = cls.block_weight(W, new_size, padding)
        new_inputs[1] = cls.pack_vnni_weight(blocked_w, micro_gemm, new_size)
    elif should_block_weight:
        # WOQ INT4: blocking only; reordering happens inside the microkernel.
        assert cls.is_woq_int4()
        new_inputs[1] = cls.block_weight(W, new_size, padding)
    elif isinstance(W, ir.IRNode):
        # Require W layout to be fixed & contiguous, happens inplace.
        ir.ExternKernel.require_contiguous(W)

    if not skip_int8_compensation and _is_int8_gemm(new_inputs):
        BCompensate = None
        x_w_scale = None

        def _get_compensation_node(W, use_int8_fast_compensation_path):
            # The "_BMatrixCompens" (and optional "_x_w_compens") constants are
            # presumably registered by the quantization lowering before this
            # runs — TODO(review): confirm against the caller.
            BCompensate = V.graph.add_tensor_constant(
                V.graph.constants[W.get_name() + "_BMatrixCompens"],
                W.get_name() + "_BMatrixCompens",
            )
            x_w_scale = None
            if use_int8_fast_compensation_path:
                x_w_scale = V.graph.add_tensor_constant(
                    V.graph.constants[W.get_name() + "_x_w_compens"],
                    W.get_name() + "_x_w_compens",
                )
            return BCompensate, x_w_scale

        if use_int8_fast_compensation_path:
            # new_inputs has been reordered: [x, w, optional[bias], x_scale, x_zp, w_scale, w_zp]
            x_scale = new_inputs[-4]
            x_zp = new_inputs[-3]
            w_scale = new_inputs[-2]
            if isinstance(W, ir.IRNode):
                BCompensate, x_w_scale = _get_compensation_node(
                    W, use_int8_fast_compensation_path
                )
            else:
                # Use the original W, not the blocked_w in new_inputs[1] to calculate BCompensate
                BCompensate = torch.sum(W.to_dense().to(torch.float), dim=0)  # type: ignore[assignment]
                assert all(
                    isinstance(item, torch.Tensor)
                    for item in (x_scale, x_zp, w_scale)
                )
                BCompensate = BCompensate * x_scale * w_scale * x_zp
                x_w_scale = x_scale * w_scale
            new_inputs.append(BCompensate)
            new_inputs.append(x_w_scale)
        else:
            if isinstance(W, ir.IRNode):
                BCompensate, _ = _get_compensation_node(
                    W, use_int8_fast_compensation_path
                )
            else:
                # Use the original W, not the blocked_w in new_inputs[1] to calculate BCompensate
                BCompensate = torch.sum(W.to_dense().to(torch.float), dim=0)  # type: ignore[assignment]
            new_inputs.append(BCompensate)
    return new_inputs, layout
1200
+
1201
+ @staticmethod
1202
+ def check_if_block_weight(W, micro_gemm):
1203
+ return True
1204
+
1205
@classmethod
def block_weight(cls, W, new_size, padding):
    """Reshape/pad the weight into the blocked 3D layout given by ``new_size``.

    Handles three cases: a graph constant (only a relabeled buffer is
    created, the actual data blocking happens elsewhere), a non-constant
    IR node (lowered pad+view+permute ops), and an eager torch.Tensor
    (direct pad/reshape/transpose).
    """
    # These are separated into two methods to allow subclasses to override them separately
    if isinstance(W, ir.IRNode):
        if W.get_name() in V.graph.constants:
            # Create a new buffer, representing the constant blocked tensor
            blocked_w = ir.Buffer(
                name=W.get_name(),  # Borrow the registered buffer name
                layout=ir.FixedLayout(
                    W.get_device_or_error(),
                    W.get_dtype(),
                    new_size,
                    ir.FlexibleLayout.contiguous_strides(new_size),
                    0,
                ),
            )
        else:
            if not isinstance(W, ir.TensorBox):
                W = ir.TensorBox(W)
            # Pad the last dim, view as [..., k, n//block_n, block_n], then
            # swap the k and n//block_n dims to obtain [..., n//block_n, k, block_n].
            permute_dims = list(range(len(new_size)))
            permute_dims[-2], permute_dims[-3] = permute_dims[-3], permute_dims[-2]
            permute_size = list(new_size)
            permute_size[-2], permute_size[-3] = permute_size[-3], permute_size[-2]
            blocked_w = L.constant_pad_nd(W, (0, padding))
            blocked_w = L.permute(
                L.view(blocked_w, permute_size),
                permute_dims,
            )
    else:
        assert isinstance(W, torch.Tensor)
        # Pad the weight tensor and reshape it into a 3D blocked shape
        blocked_size = list(new_size)
        blocked_size[-2], blocked_size[-3] = blocked_size[-3], blocked_size[-2]
        blocked_w = (
            torch.nn.functional.pad(W, (0, padding))  # type: ignore[assignment]
            .reshape(*blocked_size)
            .transpose(-3, -2)
            .contiguous()
        )
    return blocked_w
1245
+
1246
@classmethod
def pack_vnni_weight(cls, W, micro_gemm, new_size):
    """Pack an already-blocked weight into the microkernel's VNNI layout.

    For constant IR buffers the packing is assumed to have been done on the
    tensor data already, so the node is returned unchanged. Otherwise the
    interleave is expressed as view/permute/view (IR) or view/transpose
    (eager tensor).
    """
    # WOQ INT4 weights are reordered in microkernel so do not pack them here
    should_pack = (
        micro_gemm.get_b_layout() != LayoutType.NORMAL
        and not micro_gemm.is_woq_int4()
    )

    # These are separated into two methods to allow subclasses to override them separately
    if isinstance(W, ir.IRNode):
        if isinstance(W, ir.Buffer) and W.get_name() in V.graph.constants:
            # Constant blocked buffer: packed elsewhere on the tensor data.
            return W
        k = new_size[-2]
        if not isinstance(W, ir.TensorBox):
            W = ir.TensorBox(W)
        if should_pack:
            # Interleave groups of `vnni_size` rows along k:
            # view [..., k//v, v, block_n] -> swap last two dims -> view back.
            permute_dims = list(range(len(new_size) + 1))
            permute_dims[-1], permute_dims[-2] = permute_dims[-2], permute_dims[-1]
            vnni_size = 4 if micro_gemm.get_b_layout() == LayoutType.VNNI4 else 2
            vnni_view_size = list(new_size)
            vnni_view_size[-2] = k // vnni_size
            vnni_view_size.insert(-1, vnni_size)
            W = L.view(
                L.permute(L.view(W, vnni_view_size), permute_dims),
                new_size,
            )
        W = ir.ExternKernel.realize_input(W)
        W = ir.ExternKernel.require_contiguous(W)
        return W
    else:
        k = new_size[-2]
        # Apply VNNI packing to the weight tensor
        if should_pack:
            # TODO: Move VNNI weight packing for non-constant tensors into the template,
            # to improve cache locality and avoid full-tensor copy.
            layout_str = (
                "VNNI4"
                if micro_gemm.get_b_layout() == LayoutType.VNNI4
                else "VNNI2"
            )
            assert micro_gemm.get_b_layout() in [
                LayoutType.VNNI2,
                LayoutType.VNNI4,
            ], f"We only support {layout_str} for now"
            vnni_size = 4 if micro_gemm.get_b_layout() == LayoutType.VNNI4 else 2
            assert k % vnni_size == 0, (
                f"k should be divisible by vnni_size for {layout_str} layout"
            )
            vnni_view_size = list(new_size)
            vnni_view_size[-2] = k // vnni_size
            vnni_view_size.insert(-1, vnni_size)
            W = W.view(vnni_view_size).transpose(-1, -2).contiguous().view(new_size)
        # normalize stride to be "contiguous_strides" per size
        # this avoids the problems in L.view during template codegen
        new_stride = [1]
        for sz in reversed(W.shape[1:]):
            new_stride.insert(0, new_stride[0] * sz)
        W = W.as_strided(W.shape, new_stride)
        return W
1305
+
1306
def get_default_reindexers(self, epilogue_nodes):
    """Return one identity (``None``) reindexer per epilogue node."""
    return [None for _ in epilogue_nodes]
1308
+
1309
def get_options(
    self,
    kernel: CppTemplateKernel,
    template_buffer_node: Optional[ir.CppTemplateBuffer] = None,
    flag_template_buffer_has_other_users: Optional[bool] = None,
    epilogue_nodes: Optional[list[ir.IRNode]] = None,
) -> dict[str, Any]:
    """Assemble the keyword options used to render the GEMM Jinja template.

    Resolves the X/W/bias/scale inputs for the fp, int8 and WOQ-int4
    variants, builds the in-template epilogue chain, instantiates the
    microkernel, and returns everything the template needs as one dict.
    """
    assert len(self.input_nodes) >= 2

    # Input-node layout differs per GEMM flavor; unpack accordingly.
    int8_gemm = self.input_nodes[0].get_dtype() in [torch.uint8, torch.int8]
    x_scale = None
    x_zp = None
    w_scale = None
    w_zp = None
    inp = None
    q_group_size_node = None
    qscale_and_zeros = None
    if int8_gemm:
        X, W = self.input_nodes[0], self.input_nodes[1]
        bias_idx = 2 if self.has_bias else 1
        inp = self.input_nodes[bias_idx] if self.has_bias else None
        x_scale = self.input_nodes[bias_idx + 1]
        x_zp = self.input_nodes[bias_idx + 2]
        w_scale = self.input_nodes[bias_idx + 3]
        w_zp = self.input_nodes[bias_idx + 4]
        Y = self.output_node
    elif self.is_woq_int4():
        X, W = self.input_nodes[0], self.input_nodes[1]
        Y = self.output_node
        q_group_size_node = self.input_nodes[2]
        qscale_and_zeros = self.input_nodes[3]
    else:
        X, W = self.input_nodes[0], self.input_nodes[1]
        Y = self.output_node
        inp = self.input_nodes[2] if self.has_bias else None

    template_buffer_has_other_users = None

    if template_buffer_node is not None:
        # Use the updated prepacked weight buffer
        W = template_buffer_node.inputs[1]
        Y = template_buffer_node

        assert flag_template_buffer_has_other_users is not None
        template_buffer_has_other_users = flag_template_buffer_has_other_users

    template_buffer = Y
    gemm_output_buffer = template_buffer

    epilogues: list[ir.IRNode] = []
    reindexers: list[Optional[Callable[[list[Any]], list[Any]]]] = []
    epilogue_creators: list[Callable[[ir.Buffer], ir.Pointwise]] = []
    fake_buffers: list[ir.Buffer] = []
    Y_aliases: OrderedSet[str] = OrderedSet()

    # A thread-local accumulator is needed whenever the raw GEMM result
    # cannot be written straight into the (fp32, unpadded, single-user) output.
    use_local_acc = (
        self.layout.dtype != torch.float
        or template_buffer_has_other_users
        or int8_gemm
        or self.padded_n != self.n
        or self.maybe_k_slicing()
        or (epilogue_nodes and epilogue_nodes[-1].get_dtype() != self.layout.dtype)
    )

    # TODO(jgong5): for int8 gemm, bias-add is handled outside of gemm template,
    # but we'd better move it here to align with fp.
    if inp is not None and self.beta != 0 and not int8_gemm:
        # add an epilogue for bias add
        def _bias_add_epilogue(buf):
            return create_epilogue_with_attr(
                buf, "bias_add", other=inp, beta=self.beta, dtype=self.layout.dtype
            )

        epilogue_creators.append(_bias_add_epilogue)

    if self.epilogue_creator is not None:
        epilogue_creators.append(self.epilogue_creator)

    # When the GEMM output buffer is localized but it has users other than the epilogue nodes,
    # we need to copy the value in the GEMM output local buffer to a global buffer.
    def need_copy_from_local_to_global_buffer_epilogue(
        use_local_acc, template_buffer_has_other_users, epilogue_creators
    ):
        # The GEMM output buffer is a global buffer, thus copy is not needed.
        if not use_local_acc:
            return False

        # The possible value of template_buffer_has_other_users is (None, False, True)
        # It is None when generating the gemm template during autotune and it will have value during scheduler codegen.
        # extra copy_from_local_to_global_buffer_epilogue is not needed in either of the below two cases:
        # 1. template_buffer_has_other_users is None (i.e. when doing the codegen during autotune)
        # 2. template_buffer_has_other_users is False, which means it's safe to keep the value in the
        #    GEMM output buffer in local buffer only (no users outside of the epilogues will use its value).
        if not template_buffer_has_other_users:
            return False

        # When bias is not None or self.epilogue_creator is not None,
        # there will be epilogue_creators after the GEMM.
        # The GEMM output buffer is localized while
        # the output buffer of the epilogue_creators is a global buffer.
        if epilogue_creators:
            return False

        return True

    if need_copy_from_local_to_global_buffer_epilogue(
        use_local_acc, template_buffer_has_other_users, epilogue_creators
    ):

        def copy_from_local_to_global_buffer_epilogue(input_buffer: ir.Buffer):
            dtype = self.layout.dtype
            input_loader = input_buffer.make_loader()

            def copy_inner(index):
                input = input_loader(index)
                result = ops.to_dtype(input, dtype)
                return result

            return ir.Pointwise(
                device=input_buffer.get_device_or_error(),
                dtype=self.layout.dtype,
                inner_fn=copy_inner,
                ranges=input_buffer.get_size(),
            )

        epilogue_creators.append(copy_from_local_to_global_buffer_epilogue)

    # NOTE [How CPP GEMM template epilogues are organized]
    #   gemm_output_buffer
    #     --> zero or more in-template epilogues (created by `epilogue_creators`) -->
    #   template_buffer
    #     --> zero or more out-of-template epilogues (`epilogue_nodes`) -->
    #   Y
    if epilogue_creators:
        assert isinstance(template_buffer, ir.IRNode)
        gemm_output_name = f"{template_buffer.get_name()}_GemmOut"
        gemm_output_buffer = ir.Buffer(
            name=gemm_output_name, layout=template_buffer.layout
        )
        current_input_buffer = gemm_output_buffer
        for i, creator in enumerate(epilogue_creators):
            # The last in-template epilogue writes into the template buffer
            # itself; intermediates get synthesized names.
            if i == len(epilogue_creators) - 1:
                buffer_name = template_buffer.get_name()
            else:
                buffer_name = f"{gemm_output_name}_epilogue_{i}"
            epilogues.append(
                ir.ComputedBuffer(
                    name=buffer_name,
                    layout=template_buffer.layout,
                    data=creator(current_input_buffer),
                )
            )
            fake_buffers.append(current_input_buffer)
            Y_aliases.add(current_input_buffer.get_name())
            reindexers.append(None)
            if i < len(epilogue_creators) - 1:
                current_input_buffer = ir.Buffer(
                    name=buffer_name, layout=template_buffer.layout
                )

    assert isinstance(Y, (ir.Buffer, ir.ReinterpretView))
    Y_2d: Union[ir.Buffer, ir.ReinterpretView] = Y

    if epilogue_nodes:
        if not template_buffer_has_other_users:
            assert isinstance(template_buffer, ir.IRNode)
            Y_aliases.add(template_buffer.get_name())
        epilogues.extend(epilogue_nodes)
        assert Y.get_numel() == epilogues[-1].get_numel()
        Y = cast(ir.Buffer, epilogues[-1])
        assert isinstance(template_buffer, ir.Buffer)
        # The template computes in 2D; map the final epilogue output back
        # to a 2D view with matching reindexers.
        Y_2d, reindexers = gen_2d_view_of_epilogue_buf(
            Y,
            template_buffer,
            epilogue_nodes,
            reindexers,
            default_reindexers=self.get_default_reindexers(epilogue_nodes),
        )

    output_dtype, compute_dtype = get_gemm_template_output_and_compute_dtype(
        X.get_dtype()
    )
    micro_gemm = create_micro_gemm(
        f"{kernel.kernel_name}_micro_gemm",
        self.m,
        self.n,
        self.k,
        input_dtype=X.get_dtype(),
        input2_dtype=W.get_dtype(),
        output_dtype=output_dtype,
        compute_dtype=compute_dtype,
        alpha=self.alpha,
        num_threads=self.num_threads,
        use_ref=not self.is_woq_int4(),
        q_group_size=self.q_group_size(),
    )
    assert micro_gemm is not None
    micro_gemm.use_local_vnni_blocking(not self.should_block_weights)
    # Must match the blocking chosen at template-creation time.
    assert self.register_blocking == micro_gemm.register_blocking
    self.log_blockings()
    if isinstance(micro_gemm, CppMicroGemmAMX):
        counters["inductor"]["cpp_micro_gemm_amx_counter"] += 1
    if isinstance(micro_gemm, CppMicroBrgemm):
        counters["inductor"]["cpp_micro_brgemm_counter"] += 1

    L1_cache_size = torch._C._cpu._L1d_cache_size()  # per core cache size in Bytes
    assert L1_cache_size > 0, f"Expect L1_cache_size > 0 but got {L1_cache_size}"

    L2_cache_size = torch._C._cpu._L2_cache_size()  # per core cache size in Bytes
    assert L2_cache_size > 0, f"Expect L2_cache_size > 0 but got {L2_cache_size}"

    options = dict(
        X=X,
        W=W,
        inp=inp,
        Y=Y,
        N=self.n,
        K=self.k,
        PADDED_N=self.padded_n,
        GemmOut=gemm_output_buffer,
        aliases={alias: Y.get_name() for alias in Y_aliases},
        beta=self.beta,
        alpha=self.alpha,
        num_threads=self.num_threads,
        micro_gemm=micro_gemm,
        is_dynamic_M=self.is_dynamic_M,
        template=self,
        kernel=kernel,
        export_declaration=get_export_declaration(),
        epilogue_nodes=epilogues,
        reindexers=reindexers,
        Y_2d=Y_2d,
        use_local_acc=use_local_acc,
        maybe_k_slicing=self.maybe_k_slicing(),
        x_scale=x_scale,
        x_zp=x_zp,
        w_scale=w_scale,
        w_zp=w_zp,
        acc_buf_dtype=torch.int32 if int8_gemm else torch.float,
        DTYPE_TO_CPP=DTYPE_TO_CPP,
        L1_cache_size=L1_cache_size,
        L2_cache_size=L2_cache_size,
        config=config,
        fake_buffers=fake_buffers,
        is_woq_int4=self.is_woq_int4(),
        q_group_size=q_group_size_node,
        qscale_and_zeros=qscale_and_zeros,
    )
    return options
1558
+
1559
def is_int8_woq_gemm_small_m_dim(
    self,
    X: ir.ReinterpretView,
    W: ir.ReinterpretView,
    N,
    K,
    micro_gemm,
):
    """Decide whether the specialized SMALL_M_GEMM_TEMPLATE should be used.

    Applies only to the bf16-activation / int8-weight combination running on
    the FP32 vectorized microkernel, and only when the corner-case check on
    the (small) M dimension passes.
    """
    if not isinstance(micro_gemm, CppMicroGemmFP32Vec):
        return False
    m_dim = X.get_size()[0]
    if not is_int8_woq_gemm_small_m_dim_corner_case(micro_gemm, m_dim, N, K):
        return False
    return X.get_dtype() is torch.bfloat16 and W.get_dtype() is torch.int8
1576
+
1577
def render(  # type: ignore[override, return]
    self,
    kernel: CppTemplateKernel,
    template_buffer_node: Optional[ir.CppTemplateBuffer] = None,
    flag_template_buffer_has_other_users: Optional[bool] = None,
    epilogue_nodes: Optional[list[ir.IRNode]] = None,
    **kwargs,
) -> str:
    """Render the C++ GEMM kernel source for this template instance.

    Computes the render options (also cached on ``self.render_options`` for
    the ``codegen_*`` helpers), then renders either the small-M specialized
    template or the general GEMM template.
    """
    options = self.get_options(
        kernel=kernel,
        template_buffer_node=template_buffer_node,
        flag_template_buffer_has_other_users=flag_template_buffer_has_other_users,
        epilogue_nodes=epilogue_nodes,
    )
    self.render_options = options

    with contextlib.ExitStack() as stack:
        # The intermediate epilogue buffers are not registered with the
        # graph, so V.graph.get_dtype is patched to resolve their dtypes
        # for the duration of rendering.
        for buf in options["fake_buffers"]:
            stack.enter_context(
                patch.object(V.graph, "get_dtype", self._fake_get_dtype(buf))
            )
        if not options["is_dynamic_M"] and self.is_int8_woq_gemm_small_m_dim(
            options["X"],
            options["W"],
            options["N"],
            options["K"],
            options["micro_gemm"],
        ):
            template_str = SMALL_M_GEMM_TEMPLATE
        else:
            template_str = GEMM_TEMPLATE
        return self._template_from_string(template_str).render(**options)
1609
+
1610
def codegen_blocks(
    self,
    num_threads,
    N,
    K,
    micro_gemm,
    is_dynamic_M,
    kernel,
    GemmOut,
    config,
    L1_cache_size,
    L2_cache_size,
    X,
    W,
):
    """Render the cache-blocking initialization code for the GEMM kernel.

    Always emits the basic blocking section; the extended section is added
    except when the small-M specialized path applies.
    """
    render_ctx = {
        "num_threads": num_threads,
        "N": N,
        "K": K,
        "micro_gemm": micro_gemm,
        "is_dynamic_M": is_dynamic_M,
        "kernel": kernel,
        "GemmOut": GemmOut,
        "config": config,
        "L1_cache_size": L1_cache_size,
        "L2_cache_size": L2_cache_size,
        "template": self,
        "X": X,
        "W": W,
        "is_woq_int4": self.is_woq_int4(),
    }
    source = GEMM_TEMPLATE_INIT_BLOCKING_BASIC_BLOCK
    # Equivalent to: not (static M and small-M specialization applies).
    if is_dynamic_M or not self.is_int8_woq_gemm_small_m_dim(
        X, W, N, K, micro_gemm
    ):
        source += GEMM_TEMPLATE_INIT_BLOCKING_EXTENDED
    return self._template_from_string(source).render(render_ctx)
1648
+
1649
def codegen_microkernel_def(self):
    """Render the microkernel definition using the cached render options."""
    microkernel_template = self._template_from_string(GEMM_TEMPLATE_MICROKERNEL_DEF)
    return microkernel_template.render(self.render_options)
1653
+
1654
def codegen_gemm_stub_def(self):
    """Render the GEMM stub, prefixed by its microkernel definition."""
    prefix = self.codegen_microkernel_def()
    stub = self._template_from_string(GEMM_TEMPLATE_STUB_DEF).render(
        self.render_options
    )
    return prefix + stub
1659
+
1660
def codegen_multi_threads_params(self):
    """Render the per-thread blocking parameter setup for the parallel path."""
    params_template = self._template_from_string(GEMM_TEMPLATE_MULTI_THREADS_PARAMS)
    return params_template.render()
1662
+
1663
def codegen_single_thread_params(self, is_dynamic_M):
    """Render the blocking parameter setup for the single-threaded path."""
    params_template = self._template_from_string(GEMM_TEMPLATE_SINGLE_THREAD_PARAMS)
    return params_template.render({"is_dynamic_M": is_dynamic_M})
1670
+
1671
def codegen_m_loop_params(self):
    """Render the loop-variable setup for the M (row-block) loop."""
    loop_template = self._template_from_string(GEMM_TEMPLATE_M_LOOP_PARAMS)
    return loop_template.render()
1673
+
1674
def codegen_n_loop_params(self):
    """Render the loop-variable setup for the N (column-block) loop."""
    loop_template = self._template_from_string(GEMM_TEMPLATE_N_LOOP_PARAMS)
    return loop_template.render()
1676
+
1677
@classmethod
def is_woq_int4(cls):
    # Base template is not a weight-only-quantized INT4 GEMM;
    # CppWoqInt4GemmTemplateInstance overrides this to return True.
    return False
1680
+
1681
@classmethod
def q_group_size(cls):
    # Quantization group size; only meaningful for WOQ INT4 subclasses,
    # which override this with the size selected via CppWoqInt4GemmTemplate[...].
    return None
1684
+
1685
+
1686
class CppWoqInt4GemmTemplateMeta(type):
    """Metaclass enabling ``CppWoqInt4GemmTemplate[q_group_size]``.

    Indexing produces a fresh ``CppGemmTemplate`` subclass whose
    ``q_group_size()`` returns the given group size, with the WOQ INT4
    behavior overrides baked in.
    """

    def __getitem__(cls, q_group_size):
        class CppWoqInt4GemmTemplateInstance(CppGemmTemplate):
            def __init__(
                self,
                *args,
                **kwargs,
            ) -> None:
                super().__init__(
                    *args,
                    **kwargs,
                )

            @classmethod
            def is_woq_int4(cls):
                # Marks this template as the WOQ INT4 variant.
                return True

            @classmethod
            def q_group_size(cls):
                # Captured from the CppWoqInt4GemmTemplate[...] subscript.
                return q_group_size

            @staticmethod
            def check_if_block_weight(W, micro_gemm):
                # For WOQ INT4, weight is already packed
                # However, for AMX microkernel, we want to change the blocking of weight
                from .cpp_micro_gemm import CppMicroGemmWoQInt4Amx

                return isinstance(micro_gemm, CppMicroGemmWoQInt4Amx)

            @classmethod
            def block_weight(cls, W, new_size, padding):
                # This method is called only if AMX microkernels are used.
                # In this case, we unpack and repack weight so that block_n=32
                # the format of packed weight is described here:
                # https://github.com/pytorch/pytorch/blob/32eee8ed225d9f10fbbcb38c24b8b44c24c0c97c/aten/src/ATen/native/cpu/int4mm_kernel.cpp#L583
                if isinstance(W, ir.IRNode):
                    # in this case, we do nothing
                    ir.ExternKernel.require_contiguous(W)
                    blocked_w = W
                else:
                    # in this case, we unpack and repack weight
                    assert isinstance(W, torch.Tensor)
                    assert W.dim() == 2
                    N = W.size(0)
                    # Each int8 byte holds two int4 values, so K is twice
                    # the packed last dimension.
                    K = W.size(-1) * 2
                    G = cls.q_group_size()
                    # x and qscales_and_zeros are in bfloat16 instead of float to use the optimized kernel
                    # so that the unpacking process is faster
                    x = torch.eye(K).bfloat16()
                    # Here we use scale=1 and qzero=8 because we want to unpack weight
                    # without dequantizing it. The qzero here is 8 instead of 0 because
                    # int4 values are converted to [-7, 8] in the _weight_int4pack_mm_for_cpu kernel:
                    # https://github.com/pytorch/pytorch/blob/32eee8ed225d9f10fbbcb38c24b8b44c24c0c97c/aten/src/ATen/native/cpu/int4mm_kernel.cpp#L95
                    qscales_and_zeros = (
                        torch.tensor([1.0, 8.0])
                        .bfloat16()
                        .expand(K // G, N, 2)
                        .contiguous()
                    )
                    # shape: [K, N]
                    # Multiplying by the identity recovers the raw int4
                    # values via the existing optimized kernel.
                    unpacked_w = torch.ops.aten._weight_int4pack_mm_for_cpu(
                        x,
                        W,
                        G,
                        qscales_and_zeros,
                    ).to(torch.uint8)
                    block_n = 32
                    # shape: [N // block_n, K, block_n]
                    w_blocked = (
                        unpacked_w.view(K, N // block_n, block_n)
                        .permute(1, 0, 2)
                        .contiguous()
                    )
                    # pack 2 int4 -> 1 int8
                    # block_n: [a0, a1, ..., a15, b0, b1, ..., b15]
                    # -> [(a0 & 0xf) | (b0 << 4), (a1 & 0xf) | (b1 << 4), ...]
                    # shape: [N // block_n, K, 2, block_n // 2]
                    w_blocked = w_blocked.view(N // block_n, K, 2, block_n // 2)
                    # shape: [N // block_n, K, block_n // 2]
                    w_blocked_packed = (w_blocked[:, :, 0, :] & 0xF) | (
                        w_blocked[:, :, 1, :] << 4
                    )
                    # shape: [N, K // 2]
                    blocked_w = w_blocked_packed.view(N, K // 2)

                return blocked_w

        return CppWoqInt4GemmTemplateInstance
1774
+
1775
+
1776
class CppWoqInt4GemmTemplate(metaclass=CppWoqInt4GemmTemplateMeta):
    """Subscriptable factory: ``CppWoqInt4GemmTemplate[group_size]`` yields a
    WOQ INT4 GEMM template class bound to that quantization group size."""

    pass
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_grouped_gemm_template.py ADDED
@@ -0,0 +1,500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import contextlib
2
+ import logging
3
+ from typing import Any, Callable, cast, Optional, TypeVar
4
+ from unittest.mock import patch
5
+
6
+ import torch
7
+ import torch.utils
8
+ from torch.utils._ordered_set import OrderedSet
9
+
10
+ from ..._dynamo.utils import counters
11
+ from .. import config, ir
12
+ from ..kernel.mm_common import mm_args
13
+ from ..select_algorithm import ChoiceCaller, DataProcessorTemplateWrapper
14
+ from ..utils import parallel_num_threads
15
+ from ..virtualized import V
16
+ from .cpp import get_export_declaration
17
+ from .cpp_gemm_template import (
18
+ CppGemmTemplate,
19
+ expand_bias,
20
+ gen_2d_view_of_epilogue_buf,
21
+ prune_tensors,
22
+ transpose_w,
23
+ )
24
+ from .cpp_micro_gemm import CppMicroGemmAMX, create_micro_gemm
25
+ from .cpp_template_kernel import CppTemplateKernel
26
+ from .cpp_utils import (
27
+ create_epilogue_with_attr,
28
+ DTYPE_TO_CPP,
29
+ GemmBlocking,
30
+ get_gemm_template_output_and_compute_dtype,
31
+ )
32
+
33
+
34
+ log = logging.getLogger(__name__)
35
+
36
+ GEMM_TEMPLATE = r"""
37
+ {{template.header().getvalue()}}
38
+ {{micro_gemm.codegen_define(kernel)}}
39
+
40
+ extern "C" {{export_declaration}}
41
+ {{kernel.def_kernel(inputs=kernel_args, outputs=Y_list, aliases=aliases)}}
42
+ {
43
+ {{kernel.maybe_codegen_profile()}}
44
+ {{ template.codegen_blocks(
45
+ num_threads, N, K, micro_gemm, is_dynamic_M, kernel, GemmOuts[0], config, L1_cache_size, L2_cache_size, X_list[0], W_list[0]
46
+ ) }}
47
+ {%- if num_threads > 1 %}
48
+ #pragma omp parallel num_threads({{num_threads}})
49
+ {
50
+ {{ template.codegen_multi_threads_params()|indent(8, false) }}
51
+ {%- else %}
52
+ {
53
+ {{ template.codegen_single_thread_params(is_dynamic_M)|indent(8, false) }}
54
+ {%- endif %}
55
+ {{ micro_gemm.codegen_init(kernel) }}
56
+ {%- set acc_buf_name_list=[] %}
57
+ {%- set acc_buf_name_prefix = "local_acc_buf_" %}
58
+ {%- for gemm_idx in range(0, gemm_grouped_num, 1) %}
59
+ {%- set acc_buf_name = acc_buf_name_prefix + gemm_idx|string %}
60
+ {{ kernel.define_buffer(acc_buf_name, ["Mc_blocks*Mr", "Nc_blocks*Nr"], acc_buf_dtype) }}
61
+ {%- set acc_buf_name_list=acc_buf_name_list.append(acc_buf_name) %}
62
+ {%- endfor %}
63
+ for (int64_t mc_block_id = 0; mc_block_id < num_Mc_blocks_per_thread; mc_block_id++) {
64
+ {{ template.codegen_m_loop_params()|indent(12, false) }}
65
+ for (int64_t nc = n_block_start; nc < n_block_end; nc += Nc_blocks) {
66
+ {{ template.codegen_n_loop_params()|indent(16, false) }}
67
+ {%- set acc_list=[] %}
68
+ {%- for gemm_idx in range(0, gemm_grouped_num, 1) %}
69
+ {%- set acc_list = acc_list.append( kernel.local_buffers[acc_buf_name_list[gemm_idx]] ) %}
70
+ {{ kernel.reinit_buffer_if_null(acc_buf_name_list[gemm_idx]) }}
71
+ {%- endfor %}
72
+ for (int64_t kc = k_block_start; kc < k_block_end; kc += Kc_blocks) {
73
+ int64_t k_start = kc * Kr;
74
+ int64_t k_end = std::min(std::min(kc + Kc_blocks, k_block_end) * Kr, K);
75
+ {%- set tile_X_list=[] %}
76
+ {%- for gemm_idx in range(0, gemm_grouped_num, 1) %}
77
+ {%- set tile_X_list = tile_X_list.append( kernel.slice_nd(X_list[gemm_idx], [("m_start", "m_end"), ("k_start", "k_end")]) ) %}
78
+ {%- endfor %}
79
+ for (int64_t nci = nc; nci < nc_block_end; nci++) {
80
+ {%- set tile_W_3d_list=[] %}
81
+ {%- set tile_W_list=[] %}
82
+ {%- set acc_slice_list=[] %}
83
+ {%- for gemm_idx in range(0, gemm_grouped_num, 1) %}
84
+ {%- set acc_slice_list = acc_slice_list.append(
85
+ kernel.slice_nd(acc_list[gemm_idx], [("0", "m_end - m_start"), ("(nci - nc)*Nr", "(nci - nc + 1)*Nr")])
86
+ ) %}
87
+ {%- set tile_W_3d_list = tile_W_3d_list.append(
88
+ kernel.slice_nd(W_list[gemm_idx], [("nci", "nci + 1"), ("k_start", "k_end"), ()])
89
+ ) %}
90
+ {%- endfor %}
91
+ {%- for gemm_idx in range(0, gemm_grouped_num, 1) %}
92
+ {%- set tile_W_list = tile_W_list.append(
93
+ kernel.view(tile_W_3d_list[gemm_idx], ["k_end - k_start", micro_gemm.register_blocking.block_n])
94
+ ) %}
95
+ {%- endfor %}
96
+ if (kc == k_block_start) {
97
+ {%- for gemm_idx in range(0, gemm_grouped_num, 1) %}
98
+ {{ micro_gemm.codegen_call(
99
+ kernel, tile_X_list[gemm_idx], tile_W_list[gemm_idx], acc_slice_list[gemm_idx], accum=False
100
+ )|indent(28, false) }}
101
+ {%- endfor %}
102
+ } else {
103
+ {%- for gemm_idx in range(0, gemm_grouped_num, 1) %}
104
+ {{ micro_gemm.codegen_call(
105
+ kernel, tile_X_list[gemm_idx], tile_W_list[gemm_idx], acc_slice_list[gemm_idx], accum=True
106
+ )|indent(28, false) }}
107
+ {%- endfor %}
108
+ }
109
+ }
110
+ }
111
+ {
112
+ {%- set tile_acc_list = [] %}
113
+ {%- set tile_Y_list = [] %}
114
+ {%- for gemm_idx in range(0, gemm_grouped_num, 1) %}
115
+ {%- set tile_acc_list = tile_acc_list.append(
116
+ kernel.slice_nd(acc_list[gemm_idx], [("0", "m_end - m_start"), ("0", "n_end - n_start")])
117
+ ) %}
118
+ {%- set tile_Y_list = tile_Y_list.append(
119
+ kernel.slice_nd(Y_2d_list[gemm_idx], [("m_start", "m_end"), ("n_start", "n_end")])
120
+ ) %}
121
+ {%- endfor %}
122
+ {{ kernel.store_outputs(
123
+ tile_Y_list,
124
+ tile_acc_list,
125
+ GemmOuts,
126
+ epilogue_nodes,
127
+ offsets=("m_start", "n_start"),
128
+ reindexers=reindexers,
129
+ multi_output_buffers=multi_output_buffers
130
+ )|indent(20, false)
131
+ }}
132
+ }
133
+ }
134
+ }
135
+ {{ micro_gemm.codegen_finalize(kernel) }}
136
+ }
137
+ }
138
+ """
139
+
140
+
141
def get_deduplicated_act(act_mapping: dict[int, ir.IRNode]) -> list[ir.IRNode]:
    """Return the distinct activation nodes of a grouped GEMM, in GEMM-index order.

    Args:
        act_mapping: Maps each GEMM index (assumed to be the contiguous range
            ``0 .. len(act_mapping) - 1``) to its activation node. Several
            GEMMs may share the same activation buffer.

    Returns:
        The activation nodes with duplicates (keyed by buffer name) removed,
        ordered by first occurrence over ascending GEMM index.
    """
    act_deduplicated = []
    act_deduplicated_name: OrderedSet[str] = OrderedSet()
    # Iterate by GEMM index rather than dict insertion order so the result is
    # deterministic even if the mapping was populated out of order.
    # (len(act_mapping) replaces the redundant len(act_mapping.values()).)
    for act_idx in range(len(act_mapping)):
        act = act_mapping[act_idx]
        if act.get_name() not in act_deduplicated_name:
            act_deduplicated.append(act)
            act_deduplicated_name.add(act.get_name())
    return act_deduplicated
150
+
151
+
152
class CppGroupedGemmTemplate(CppGemmTemplate):
    # Codegen template for a *group* of same-shaped GEMMs that share one C++
    # kernel; see the __init__ docstring for the exact contract.

    def __init__(
        self,
        input_nodes: list[ir.IRNode],
        layout: ir.Layout,
        num_threads: int,
        register_blocking: GemmBlocking,
        beta: int = 1,
        alpha: int = 1,
        has_bias: bool = False,
        epilogue_creator: Optional[Callable[[ir.Buffer], ir.Pointwise]] = None,
        act_mapping: Optional[dict[int, ir.IRNode]] = None,
        gemm_grouped_num: int = 1,
    ) -> None:
        """
        Template for Group of GEMMs:
        * Each GEMM has the same dimensions (m, n, k) and the same leading dimensions (lda, ldb, ldc)
        for their A, B, and C matrices.
        * Each GEMM has distinct or shared activations, has distinct weight, has unique bias or no bias, has distinct epilogues.
        * In the current implementation, the outputs of all GEMMs are accumulated using pointwise epilogues.
        This behavior can be extended in the future if needed.
        """
        super().__init__(
            input_nodes,
            layout,
            num_threads,
            register_blocking,
            beta,
            alpha,
            has_bias,
            epilogue_creator,
        )
        # act_mapping: GEMM index -> activation node (activations may be shared).
        self.act_mapping = act_mapping
        self.gemm_grouped_num = gemm_grouped_num
        # One placeholder output buffer per GEMM in the group; replaced by the
        # real template-buffer outputs in render() once scheduling has run.
        self.output_node: list[ir.Buffer] = [
            ir.Buffer(name="buf_out" + str(idx), layout=layout)
            for idx in range(gemm_grouped_num)
        ]

    @classmethod
    def add_choices(
        cls,
        choices: list[ChoiceCaller],
        layout: ir.Layout,
        input_nodes: list[ir.IRNode],
        beta: int = 1,
        alpha: int = 1,
        has_bias: tuple[bool, ...] = (False, False),
        trans_w: bool = False,
        input_indices: Optional[list[int]] = None,
        epilogue_creator: Optional[Callable[[ir.Buffer], ir.Pointwise]] = None,
        act_mapping: Optional[dict[int, ir.IRNode]] = None,  # gemm idx to its act buf
    ) -> DataProcessorTemplateWrapper:
        """Register this grouped-GEMM template as an autotuning choice.

        Builds the pre/post-processing pipeline (input reordering, dense
        conversion, weight transpose/bias expansion, VNNI weight packing) and
        appends a DataProcessorTemplateWrapper choice to `choices`.

        `has_bias` has one entry per GEMM, so its length defines the group size.
        """
        # Input nodes order: x, optional[x1], ... w0, w1, ... optional[b0], optional[b1], ...
        gemm_grouped_num = len(has_bias)
        assert act_mapping
        act_deduplicated = get_deduplicated_act(act_mapping)
        # Layout of the flattened input list: activations first, then one
        # weight per GEMM, then the (optional) biases.
        wgt_start_idx = len(act_deduplicated)
        bias_start_idx = wgt_start_idx + gemm_grouped_num
        # NOTE(review): the `input_indices` parameter is unconditionally
        # overwritten here with the identity permutation — TODO confirm
        # whether a caller-supplied ordering should be honored instead.
        input_indices = list(range(len(input_nodes)))

        _T = TypeVar("_T", ir.IRNode, torch.Tensor)
        _U = TypeVar("_U", ir.Layout, torch.Tensor)

        def reorder_and_filter(
            inputs: list[_T],
            layout_or_out: _U,
        ) -> tuple[list[_T], _U]:
            # Reorder inputs according to input_indices (currently identity).
            assert input_indices is not None, "input_indices must be set"
            return [inputs[idx] for idx in input_indices], layout_or_out

        new_inputs, new_layout = reorder_and_filter(input_nodes, layout)

        def maybe_to_dense(
            inputs: list[_T],
            layout_or_out: _U,
        ) -> tuple[list[_T], _U]:
            # Convert any MKLDNN weight tensors to dense layout.
            new_inputs = list(inputs)
            for idx in range(wgt_start_idx, wgt_start_idx + gemm_grouped_num):
                if isinstance(inputs[idx], torch.Tensor):
                    W = inputs[idx]
                    assert isinstance(W, torch.Tensor), "W must be a torch.Tensor"
                    new_inputs[idx] = W.to_dense() if W.is_mkldnn else W
            return new_inputs, layout_or_out

        def normalize_shapes(
            inputs: list[_T],
            layout_or_out: _U,
        ) -> tuple[list[_T], _U]:
            # When trans_w is set, transpose every weight and expand every
            # bias against the activation shape; otherwise a no-op.
            new_inputs: list[_T] = list(inputs)
            if not trans_w:
                return new_inputs, layout_or_out
            X = new_inputs[0]
            for wgt_idx in range(wgt_start_idx, wgt_start_idx + gemm_grouped_num):
                new_input = new_inputs[wgt_idx]
                new_inputs[wgt_idx] = transpose_w(new_input, trans_w)
            for bias_idx in range(bias_start_idx, len(new_inputs)):
                new_bias = expand_bias(new_inputs[bias_idx], X)
                assert new_bias is not None
                new_inputs[bias_idx] = new_bias
            return new_inputs, layout_or_out

        num_threads = parallel_num_threads()
        new_inputs, _ = normalize_shapes(*maybe_to_dense(new_inputs, new_layout))
        # All GEMMs share (m, n, k), so probing the first X/W pair suffices.
        m, n, k, *_ = mm_args(new_inputs[0], new_inputs[wgt_start_idx])
        output_dtype, compute_dtype = get_gemm_template_output_and_compute_dtype(
            new_inputs[0].get_dtype()
        )
        micro_gemm = create_micro_gemm(
            "micro_gemm",
            m,
            n,
            k,
            input_dtype=new_inputs[0].get_dtype(),
            input2_dtype=new_inputs[wgt_start_idx].get_dtype(),
            output_dtype=output_dtype,
            compute_dtype=compute_dtype,
            alpha=alpha,
            num_threads=num_threads,
        )
        assert micro_gemm is not None
        _, block_n, _ = micro_gemm.register_blocking
        # Pad N up to a multiple of the micro-kernel's block_n.
        new_size, padded_n = cls.get_padded_size(
            n, block_n, k, should_block_weight=True
        )
        padding = padded_n - n

        def pack_weight(
            inputs: list[_T],
            layout_or_out: _U,
        ) -> tuple[list[_T], _U]:
            # Block and VNNI-pack every weight in the group for the chosen
            # micro-kernel layout.
            new_W_list = []
            new_inputs = list(inputs)
            W_list = new_inputs[wgt_start_idx : wgt_start_idx + gemm_grouped_num]
            for W in W_list:
                blocked_w = cls.block_weight(W, new_size, padding)
                new_W_list.append(cls.pack_vnni_weight(blocked_w, micro_gemm, new_size))
            new_inputs[wgt_start_idx : wgt_start_idx + gemm_grouped_num] = new_W_list
            return new_inputs, layout_or_out

        def preprocessor(
            inputs: list[_T],
            layout: _U,
        ) -> tuple[list[_T], _U]:
            # Full input pipeline: reorder -> densify -> normalize -> pack.
            return pack_weight(
                *normalize_shapes(*maybe_to_dense(*reorder_and_filter(inputs, layout)))
            )

        def postprocessor(output: _T) -> _T:
            # After the template is materialized, replace the constant weight
            # inputs of the template buffer with their packed counterparts and
            # drop the now-unused originals from the graph constants.
            if isinstance(output, ir.TensorBox):
                template_buffer = ir.InputsKernel.unwrap_storage_for_input(output)
                assert isinstance(template_buffer, ir.CppTemplateBuffer)
                new_input_nodes, _ = reorder_and_filter(input_nodes, layout)
                W_nodes = new_input_nodes[
                    wgt_start_idx : wgt_start_idx + gemm_grouped_num
                ]
                W_tensor = []
                for W_node in W_nodes:
                    assert W_node.get_name() in V.graph.constants
                    W_tensor.append(V.graph.constants[W_node.get_name()])
                new_input_nodes[wgt_start_idx : wgt_start_idx + gemm_grouped_num] = (
                    W_tensor  # type: ignore[assignment]
                )
                new_input_nodes, _ = pack_weight(
                    *normalize_shapes(*maybe_to_dense(new_input_nodes, layout))
                )
                # Prune unused tensors
                prune_tensors(input_nodes, new_input_nodes)
                for idx in range(wgt_start_idx, wgt_start_idx + gemm_grouped_num):
                    W_packed = new_input_nodes[idx]
                    assert isinstance(W_packed, torch.Tensor)
                    W_packed_constant = V.graph.add_tensor_constant(W_packed)
                    template_buffer.inputs[idx] = (
                        ir.InputsKernel.unwrap_storage_for_input(W_packed_constant)
                    )
            return output

        template = DataProcessorTemplateWrapper(
            CppGroupedGemmTemplate,
            preprocessor,
            postprocessor,
            input_nodes=input_nodes,
            layout=layout,
            num_threads=num_threads,
            register_blocking=micro_gemm.register_blocking,
            beta=beta,
            alpha=alpha,
            has_bias=has_bias,
            epilogue_creator=epilogue_creator,
            act_mapping=act_mapping,
            gemm_grouped_num=gemm_grouped_num,
        )
        template.maybe_append_choice(choices)
        return template

    def render(  # type: ignore[override,return,no-untyped-def]
        self,
        kernel: CppTemplateKernel,
        template_buffer_node: Optional[ir.CppTemplateBuffer] = None,
        flag_template_buffer_has_other_users: Optional[bool] = None,
        epilogue_nodes: Optional[list[ir.IRNode]] = None,
        **kwargs,
    ) -> str:
        """Render the grouped-GEMM C++ kernel source from the Jinja template.

        Collects per-GEMM activations, packed weights, optional biases and
        epilogues, recreates the micro-GEMM for this kernel name, and renders
        GEMM_TEMPLATE with the assembled options.
        """
        assert self.act_mapping
        act_deduplicated = get_deduplicated_act(self.act_mapping)
        wgt_start_idx = len(act_deduplicated)
        bias_start_idx = wgt_start_idx + self.gemm_grouped_num
        X_list = list(self.act_mapping.values())
        W_list = self.input_nodes[wgt_start_idx : wgt_start_idx + self.gemm_grouped_num]
        # inp_list[i] is GEMM i's bias input, or None when has_bias[i] is False;
        # biases are packed densely after the weights, hence the running cursor.
        inp_list = []
        cur_idx = bias_start_idx
        for inp_idx in range(self.gemm_grouped_num):
            inp = None
            if self.has_bias[inp_idx]:
                inp = self.input_nodes[cur_idx]
                cur_idx += 1
            inp_list.append(inp)

        Y_list = self.output_node
        multi_output_buffers = None
        if template_buffer_node is not None:
            # Post-scheduling: take the packed weights and the real multi-output
            # buffers from the materialized template buffer node.
            W_list = template_buffer_node.inputs[
                wgt_start_idx : wgt_start_idx + self.gemm_grouped_num
            ]
            assert isinstance(template_buffer_node.outputs, list)
            Y_list = template_buffer_node.outputs
            counters["inductor"]["cpp_grouped_gemm_template"] += 1
            multi_output_buffers = template_buffer_node.outputs

        template_buffer = Y_list[0]
        fake_buffers: list[ir.Buffer] = []
        Y_2d_list = Y_list
        output_dtype, compute_dtype = get_gemm_template_output_and_compute_dtype(
            X_list[0].get_dtype()
        )
        micro_gemm = create_micro_gemm(
            f"{kernel.kernel_name}_micro_gemm",
            self.m,
            self.n,
            self.k,
            input_dtype=X_list[0].get_dtype(),
            input2_dtype=W_list[0].get_dtype(),
            output_dtype=output_dtype,
            compute_dtype=compute_dtype,
            alpha=self.alpha,
            num_threads=self.num_threads,
        )
        assert micro_gemm is not None
        # The micro-GEMM recreated here must agree with the one selected in
        # add_choices, otherwise the packed weights would not match.
        assert self.register_blocking == micro_gemm.register_blocking
        self.log_blockings()
        if isinstance(micro_gemm, CppMicroGemmAMX):
            counters["inductor"]["cpp_micro_gemm_amx_counter"] += 1

        L1_cache_size = torch._C._cpu._L1d_cache_size()  # per core cache size in Bytes
        assert L1_cache_size > 0, f"Expect L1_cache_size > 0 but got {L1_cache_size}"

        L2_cache_size = torch._C._cpu._L2_cache_size()  # per core cache size in Bytes
        assert L2_cache_size > 0, f"Expect L2_cache_size > 0 but got {L2_cache_size}"

        epilogues: list[ir.IRNode] = []
        reindexers: list[Optional[Callable[[list[Any]], list[Any]]]] = []
        # One intermediate accumulation buffer per GEMM, named after the
        # template buffer (<buf>_GemmOut<i>).
        gemm_output_buffers: list[ir.Buffer] = []
        for out_buf_idx in range(self.gemm_grouped_num):
            gemm_output_name = f"{template_buffer.get_name()}_GemmOut" + str(
                out_buf_idx
            )
            gemm_output_buffers.append(
                ir.Buffer(name=gemm_output_name, layout=template_buffer.layout)
            )

        assert not self.epilogue_creator, (
            "epilogue_creator is not supported yet in Grouped GEMM Template"
        )

        # Kernel argument naming convention consumed by the Jinja template:
        # X<i> for deduplicated activations, W<i> for weights, inp<i> for biases
        # (inp<i> may be None when the GEMM has no bias).
        kernel_args: dict[str, Optional[ir.IRNode]] = {}
        for x_idx in range(wgt_start_idx):
            kernel_args["X" + str(x_idx)] = act_deduplicated[x_idx]
        for w_idx in range(self.gemm_grouped_num):
            kernel_args["W" + str(w_idx)] = W_list[w_idx]
        for inp_idx in range(self.gemm_grouped_num):
            kernel_args["inp" + str(inp_idx)] = inp_list[inp_idx]

        def _bias_add_epilogue(buf: ir.IRNode, inp: ir.IRNode) -> ir.Pointwise:
            # Fuse `Y = buf + beta * inp` as a pointwise epilogue.
            return create_epilogue_with_attr(
                buf, "bias_add", other=inp, beta=self.beta, dtype=self.layout.dtype
            )

        for gemm_idx, inp in enumerate(inp_list):
            if inp:
                buffer_name = Y_list[gemm_idx].get_name()
                epilogues.append(
                    ir.ComputedBuffer(
                        name=buffer_name,
                        layout=template_buffer.layout,
                        data=_bias_add_epilogue(gemm_output_buffers[gemm_idx], inp),
                    )
                )
                reindexers.append(None)

        if epilogue_nodes:
            epilogues.extend(epilogue_nodes)
            for epilogue_node in epilogue_nodes:
                Y = cast(ir.Buffer, epilogue_node)
                _, reindexers = gen_2d_view_of_epilogue_buf(
                    Y,
                    template_buffer,
                    [
                        epilogue_node,
                    ],
                    reindexers,
                    default_reindexers=[
                        None,
                    ],
                )

        options = dict(
            N=self.n,
            K=self.k,
            PADDED_N=self.padded_n,
            aliases={},
            beta=self.beta,
            alpha=self.alpha,
            num_threads=self.num_threads,
            micro_gemm=micro_gemm,
            is_dynamic_M=self.is_dynamic_M,
            template=self,
            kernel=kernel,
            export_declaration=get_export_declaration(),
            acc_buf_dtype=torch.float,
            DTYPE_TO_CPP=DTYPE_TO_CPP,
            L1_cache_size=L1_cache_size,
            L2_cache_size=L2_cache_size,
            config=config,
            epilogue_nodes=epilogues,
            GemmOuts=gemm_output_buffers,
            reindexers=reindexers,
            kernel_args=kernel_args,
            X_list=X_list,
            W_list=W_list,
            gemm_grouped_num=self.gemm_grouped_num,
            Y_list={"Y" + str(idx): Y for idx, Y in enumerate(Y_list)},
            Y_2d_list=Y_2d_list,
            multi_output_buffers=multi_output_buffers,
        )
        with contextlib.ExitStack() as stack:
            # Fake dtype resolution for buffers that only exist inside the
            # template while rendering.
            stack.enter_context(
                patch.object(V.graph, "get_dtype", self._fake_get_dtype(fake_buffers))
            )
            return self._template_from_string(GEMM_TEMPLATE).render(**options)
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_micro_gemm.py ADDED
@@ -0,0 +1,2011 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import dataclasses
3
+ import operator
4
+ import sys
5
+ from enum import Enum
6
+ from typing import Callable, Optional
7
+
8
+ import torch
9
+
10
+ from .. import cpp_builder, ir
11
+ from ..cpu_vec_isa import (
12
+ pick_vec_isa,
13
+ VecAMX,
14
+ VecAVX2,
15
+ VecAVX512,
16
+ VecISA,
17
+ VecNEON,
18
+ VecSVE256,
19
+ )
20
+ from ..utils import IndentedBuffer, parallel_num_threads
21
+ from ..virtualized import V
22
+ from .common import KernelTemplate
23
+ from .cpp_template_kernel import CppTemplateKernel
24
+ from .cpp_utils import DTYPE_TO_CPP, GemmBlocking, value_to_cpp
25
+
26
+
27
class LayoutType(Enum):
    """Memory layout of the weight (B) operand expected by a micro-GEMM kernel.

    NORMAL is an unpacked layout; VNNI2/VNNI4 correspond to VNNI-interleaved
    packing (matching the ``vnni_size`` of 2 or 4 exposed by
    ``CppMicroGemm.get_common_options``).
    """

    NORMAL = 0
    VNNI2 = 1
    VNNI4 = 2
31
+
32
+
33
# True when running on a Windows host (MSVC toolchain).
_IS_WINDOWS = sys.platform == "win32"


def get_restrict_keyword() -> str:
    """Return the compiler-specific spelling of the C/C++ `restrict` qualifier.

    MSVC uses `__restrict`
    (https://learn.microsoft.com/en-us/cpp/cpp/extension-restrict?view=msvc-170),
    while GCC/Clang use `__restrict__`.
    """
    return "__restrict" if _IS_WINDOWS else "__restrict__"
42
+
43
+
44
class CppMicroGemm:
    """
    A class that codegens a kernel that computes small-sized matrix multiplication.

    A micro GEMM kernel is responsible for register blocking, instruction selection,
    and other CPU architecture-specific optimizations.

    The subclasses need to override `codegen_define` to define the kernel function
    that is called by the code generated by `codegen_call`.
    """

    # TODO(jgong5): support constant shapes and lds as template args.
    # Jinja template for the C++ declaration of the micro-kernel; the `accum`
    # template parameter selects C += A@B vs C = A@B (see codegen_call).
    DECLARE_KERNEL = r"""
template <bool accum, bool prefetch=false>
inline void {{kernel_name}}(
{%- if kernel_extra_args_declare %}
    {{kernel_extra_args_declare}}
{%- endif %}
    const {{input_t}}* {{restrict_keyword}} A,
    const {{input2_t}}* {{restrict_keyword}} B,
    {{output_t}}* {{restrict_keyword}} C,
    int64_t M,
    int64_t N,
    int64_t K,
    int64_t lda,
    int64_t ldb,
    int64_t ldc
)
"""

    def __init__(
        self,
        name,
        input_dtype,
        input2_dtype,
        output_dtype,
        compute_dtype,
        register_blocking,
        alpha=1,
    ) -> None:
        """Record kernel name, operand dtypes, register blocking and alpha.

        `register_blocking` is a GemmBlocking (block_m, block_n, block_k) tile.
        """
        self.name = name
        self.input_dtype = input_dtype
        assert input2_dtype is not None
        self.input2_dtype = input2_dtype
        self.output_dtype = output_dtype
        self.compute_dtype = compute_dtype
        self.register_blocking = register_blocking
        self.alpha = alpha
        # When True, B is VNNI-packed inside the kernel instead of ahead of
        # time (toggled via use_local_vnni_blocking).
        self.pack_vnni_B_locally = False

    def get_common_options(self):
        """Return the option dict shared by all micro-GEMM Jinja templates."""
        # int8 GEMM contract: u8/s8 A requires s8 B with int32 accumulate/output.
        if self.input_dtype in [torch.uint8, torch.int8]:
            assert self.compute_dtype == torch.int32
            assert self.output_dtype == torch.int32
            assert self.input2_dtype == torch.int8
        return {
            "torch": torch,
            "kernel_name": self.name,
            "input_dtype": self.input_dtype,
            "input2_dtype": self.input2_dtype,
            "output_dtype": self.output_dtype,
            "compute_dtype": self.compute_dtype,
            "input_t": DTYPE_TO_CPP[self.input_dtype],
            "input2_t": DTYPE_TO_CPP[self.input2_dtype],
            "output_t": DTYPE_TO_CPP[self.output_dtype],
            "compute_t": DTYPE_TO_CPP[self.compute_dtype],
            "alpha": self.alpha,
            "kernel_extra_args_declare": self.get_kernel_extra_args_declare(),
            "int8_gemm": self.input_dtype in [torch.uint8, torch.int8],
            # VNNI interleave width: 4 for int8 inputs, 2 otherwise.
            "vnni_size": 4 if self.input_dtype in [torch.uint8, torch.int8] else 2,
            "restrict_keyword": get_restrict_keyword(),
            "pack_vnni_B_locally": self.pack_vnni_B_locally,
            "template": self,
            "is_woq_int4": self.is_woq_int4(),
        }

    def get_kernel_declaration(self):
        """Render DECLARE_KERNEL with the common options."""
        options = self.get_common_options()
        return KernelTemplate._template_from_string(self.DECLARE_KERNEL).render(options)

    def get_kernel_extra_args_declare(self) -> str:
        # No extra kernel parameters by default; subclasses may override.
        return ""

    def get_kernel_extra_args(self, **kwargs) -> list[str]:
        # No extra call-site arguments by default; subclasses may override.
        return []

    def codegen_define(self, kernel: CppTemplateKernel) -> str:
        """Emit the C++ definition of the micro-kernel (subclass hook)."""
        raise NotImplementedError

    def codegen_call(
        self,
        kernel: CppTemplateKernel,
        A: ir.Buffer,
        B: ir.Buffer,
        C: ir.Buffer,
        accum: bool,
        prefetch: bool = False,
        **kwargs_for_extra_args,
    ) -> str:
        """
        Generate the code for calling the templated kernel that computes
        `C += alpha * A @ B` if `accum` is True, or `C = alpha * A @ B` otherwise.
        """
        # Pointers to the [0, 0] element of each 2D buffer slice.
        A_ptr = f"&({kernel.index(A, [0, 0])})"
        B_ptr = f"&({kernel.index(B, [0, 0])})"
        C_ptr = f"&({kernel.index(C, [0, 0])})"
        # M/N come from C's shape, K from A's second dim; leading dimensions
        # are the row strides of the respective buffers.
        M = kernel.size(C, 0)
        N = kernel.size(C, 1)
        K = kernel.size(A, 1)
        lda = kernel.stride(A, 0)
        ldb = kernel.stride(B, 0)
        ldc = kernel.stride(C, 0)
        res = IndentedBuffer()
        res.writeline(
            f"{self.name}<{value_to_cpp(accum, 'bool')}, {value_to_cpp(prefetch, 'bool')}>("
        )
        with res.indent():
            kwargs_for_extra_args.update({"kernel": kernel})
            extra_args = self.get_kernel_extra_args(**kwargs_for_extra_args)
            for arg in extra_args:
                res.writeline(arg)
            res.writeline(f"{A_ptr},")
            res.writeline(f"{B_ptr},")
            res.writeline(f"{C_ptr},")
            res.writeline(f"{M},")
            res.writeline(f"{N},")
            res.writeline(f"{K},")
            res.writeline(f"{lda},")
            res.writeline(f"{ldb},")
            res.writeline(f"{ldc}")
        res.writeline(");")
        return res.getvalue()

    def use_local_vnni_blocking(self, should_block_weight: bool):
        # Switch between ahead-of-time and in-kernel VNNI packing of B.
        self.pack_vnni_B_locally = should_block_weight

    def codegen_init(
        self,
        kernel: CppTemplateKernel,
    ) -> str:
        # Per-thread initialization code; empty by default (subclass hook).
        return ""

    def codegen_finalize(
        self,
        kernel: CppTemplateKernel,
    ) -> str:
        # Per-thread finalization code; empty by default (subclass hook).
        return ""

    def get_b_layout(self) -> LayoutType:
        # Expected layout of the B operand; subclasses return VNNI variants.
        return LayoutType.NORMAL

    # Jinja template that allocates a temporary weight buffer, on the heap
    # for MSVC and on the (page-aligned) stack elsewhere.
    ALLOCATE_WEIGHT_BUFFER = r"""
{%- if is_msvc_compiler %}
    // MSVC doesn't support stack-allocated dynamic-sized arrays, so using heap memory here.
    std::unique_ptr<{{buffer_dtype}}[]> heap_deq_b_buf_ptr(new {{buffer_dtype}}[{{buffer_size}}]);
    {{buffer_dtype}}* {{buffer_name}} = heap_deq_b_buf_ptr.get();
{%- else %}
    // It's safe to use a stack-allocated array since the blocking strategy would
    // require us to allocate an array that's smaller than the size of L1D cache,
    // and the default per thread max stack size on Linux is quite higher,
    // so we need not worry about stack overflow.
    alignas(4096) {{buffer_dtype}} {{buffer_name}}[{{buffer_size}}];
{%- endif %}
"""

    def codegen_allocate_weight_buffer(
        self, buffer_name: str, buffer_dtype: str, *size_args
    ) -> str:
        """Emit C++ that allocates a weight scratch buffer of prod(size_args) elements."""
        buffer_size = " * ".join(map(str, size_args))
        return KernelTemplate._template_from_string(self.ALLOCATE_WEIGHT_BUFFER).render(
            dict(
                buffer_name=buffer_name,
                buffer_dtype=buffer_dtype,
                buffer_size=buffer_size,
                is_msvc_compiler=cpp_builder.is_msvc_cl(),
            )
        )

    def is_woq_int4(self):
        # Whether this kernel is a weight-only-quantized int4 GEMM; overridden
        # by the relevant subclasses.
        return False
224
+
225
+
226
@dataclasses.dataclass
class CppMicroGemmConfig:
    """One registrable (dtypes, ISA, register-blocking) combination for a micro-GEMM class."""

    input_dtype: torch.dtype  # dtype of operand A
    input2_dtype: torch.dtype  # dtype of operand B
    output_dtype: torch.dtype  # dtype of the C output
    compute_dtype: torch.dtype  # dtype used for accumulation
    vec_isa_cls: type[VecISA]  # vector ISA this config targets (e.g. VecAVX512)
    register_blocking: GemmBlocking  # (block_m, block_n, block_k) register tile
    # Optional predicate deciding whether this config applies for a given
    # problem; called as extra_check(config, m, n, k, alpha, num_threads,
    # **kwargs) — see check_int8_woq_small_m_dim for an example.
    extra_check: Optional[Callable[..., bool]] = None
235
+
236
+
237
# Registry mapping each micro-GEMM implementation class to the list of
# configs (dtype / ISA / register-blocking combinations) it supports.
micro_gemm_configs: dict[type[CppMicroGemm], list[CppMicroGemmConfig]] = {}


def register_micro_gemm(*configs):
    """Class decorator that records the given configs for a CppMicroGemm subclass."""

    def _decorator(klass):
        assert klass not in micro_gemm_configs, (
            f"Duplicate micro_gemm registration for {klass}"
        )
        assert len(configs) > 0, f"No micro_gemm configs provided for {klass}"
        micro_gemm_configs[klass] = list(configs)
        return klass

    return _decorator
250
+
251
+
252
def generate_gemm_config(
    vec_isa_cls,
    register_blockings,
    input_dtype=torch.float,
    input2_dtype=None,
    output_dtype=None,
    compute_dtype=None,
    extra_check=None,
):
    """Build one CppMicroGemmConfig per entry of `register_blockings`.

    Unspecified dtypes default in a cascade: output_dtype falls back to
    input_dtype, compute_dtype to output_dtype, and input2_dtype to
    input_dtype.
    """
    output_dtype = input_dtype if output_dtype is None else output_dtype
    compute_dtype = output_dtype if compute_dtype is None else compute_dtype
    input2_dtype = input_dtype if input2_dtype is None else input2_dtype
    configs = []
    for blocking in register_blockings:
        configs.append(
            CppMicroGemmConfig(
                input_dtype,
                input2_dtype,
                output_dtype,
                compute_dtype,
                vec_isa_cls,
                GemmBlocking(*blocking),
                extra_check,
            )
        )
    return configs
279
+
280
+
281
class CppMicroGemmRef(CppMicroGemm):
    """
    A reference implementation of the CppMicroGemm class with naive C++ code.
    It is used for correctness debugging.
    """

    # Naive triple loop computing C (+)= alpha * A @ B in compute_t precision.
    TEMPLATE_ENTRY = r"""
{{declare_kernel}} {
    for (int64_t m = 0; m < M; ++m) {
        for (int64_t n = 0; n < N; ++n) {
            {{compute_t}} result = accum ? C[m * ldc + n] : 0;
            for (int64_t k = 0; k < K; ++k) {
                result += ({{compute_t}})A[m * lda + k] * ({{compute_t}})B[k * ldb + n] * {{alpha}};
            }
            C[m * ldc + n] = result;
        }
    }
}
"""

    def __init__(
        self, name, input_dtype, input2_dtype, output_dtype, compute_dtype, alpha
    ) -> None:
        # The reference kernel uses a trivial 1x1x1 register blocking since it
        # performs no vectorization.
        super().__init__(
            name,
            input_dtype,
            input2_dtype,
            output_dtype,
            compute_dtype,
            GemmBlocking(1, 1, 1),
            alpha,
        )

    def codegen_define(self, kernel: CppTemplateKernel) -> str:
        """Render the naive reference kernel definition."""
        options = {
            "declare_kernel": self.get_kernel_declaration(),
            **self.get_common_options(),
        }
        return KernelTemplate._template_from_string(self.TEMPLATE_ENTRY).render(options)
320
+
321
+
322
def is_int8_woq_gemm_small_m_dim_corner_case(config, m, n, k):
    """Return True for the small-M int8 WoQ corner case.

    Holds when M < 16 and both K and N divide evenly by the config's
    register-blocking block_k / block_n.
    """
    blocking = config.register_blocking
    if k % blocking.block_k != 0:
        return False
    if n % blocking.block_n != 0:
        return False
    return m < 16
328
+
329
+
330
# extra check for small M dimension for int8 WoQ case
def check_int8_woq_small_m_dim(config, m, n, k, alpha, num_threads, **kwargs):
    """Select small-M int8 WoQ configs, but only when M is static."""
    if kwargs.get("dynamic_M", False):
        return False
    return is_int8_woq_gemm_small_m_dim_corner_case(config, m, n, k)
335
+
336
+
337
# For int8 WoQ GEMM with small M, we use different blockings that shouldn't be used otherwise
def do_not_use_with_small_m_for_int8_woq(config, m, n, k, alpha, num_threads, **kwargs):
    """Inverse gate of check_int8_woq_small_m_dim for the general blockings."""
    small_m_case = check_int8_woq_small_m_dim(
        config, m, n, k, alpha, num_threads, **kwargs
    )
    return not small_m_case
340
+
341
+
342
+ @register_micro_gemm(
343
+ *generate_gemm_config(
344
+ VecAVX512,
345
+ [(8, 48, 1), (8, 32, 1), (16, 16, 1)],
346
+ input_dtype=torch.float,
347
+ ),
348
+ *generate_gemm_config(
349
+ VecAVX512,
350
+ [(8, 48, 1), (8, 32, 1), (16, 16, 1)],
351
+ input_dtype=torch.bfloat16,
352
+ output_dtype=torch.float,
353
+ ),
354
+ *generate_gemm_config(
355
+ VecAVX512,
356
+ [(8, 48, 1), (8, 32, 1), (16, 16, 1)],
357
+ input_dtype=torch.half,
358
+ output_dtype=torch.float,
359
+ ),
360
+ *generate_gemm_config(
361
+ VecAVX512,
362
+ [(8, 48, 1), (8, 32, 1), (16, 16, 1)],
363
+ input_dtype=torch.bfloat16,
364
+ input2_dtype=torch.int8,
365
+ output_dtype=torch.float,
366
+ compute_dtype=torch.float,
367
+ extra_check=do_not_use_with_small_m_for_int8_woq,
368
+ ),
369
+ *generate_gemm_config(
370
+ VecAVX512,
371
+ [
372
+ (4, 32, 64),
373
+ (8, 32, 64),
374
+ ],
375
+ input_dtype=torch.bfloat16,
376
+ input2_dtype=torch.int8,
377
+ output_dtype=torch.float,
378
+ compute_dtype=torch.float,
379
+ extra_check=check_int8_woq_small_m_dim,
380
+ ),
381
+ *generate_gemm_config(
382
+ VecAVX2,
383
+ [(4, 24, 1), (4, 16, 1), (8, 8, 1)],
384
+ input_dtype=torch.float,
385
+ ),
386
+ *generate_gemm_config(
387
+ VecAVX2,
388
+ [(4, 24, 1), (4, 16, 1), (8, 8, 1)],
389
+ input_dtype=torch.bfloat16,
390
+ output_dtype=torch.float,
391
+ ),
392
+ *generate_gemm_config(
393
+ VecAVX2,
394
+ [(4, 24, 1), (4, 16, 1), (8, 8, 1)],
395
+ input_dtype=torch.half,
396
+ output_dtype=torch.float,
397
+ ),
398
+ *generate_gemm_config(
399
+ VecAVX2,
400
+ [(4, 24, 1), (4, 16, 1), (8, 8, 1)],
401
+ input_dtype=torch.bfloat16,
402
+ input2_dtype=torch.int8,
403
+ output_dtype=torch.float,
404
+ compute_dtype=torch.float,
405
+ extra_check=do_not_use_with_small_m_for_int8_woq,
406
+ ),
407
+ *generate_gemm_config(
408
+ VecAVX2,
409
+ [
410
+ (2, 16, 64),
411
+ (4, 16, 64),
412
+ ],
413
+ input_dtype=torch.bfloat16,
414
+ input2_dtype=torch.int8,
415
+ output_dtype=torch.float,
416
+ compute_dtype=torch.float,
417
+ extra_check=check_int8_woq_small_m_dim,
418
+ ),
419
+ *generate_gemm_config(
420
+ VecNEON,
421
+ [(4, 24, 1), (4, 16, 1), (8, 8, 1)],
422
+ input_dtype=torch.float,
423
+ input2_dtype=torch.float,
424
+ output_dtype=torch.float,
425
+ compute_dtype=torch.float,
426
+ ),
427
+ *generate_gemm_config(
428
+ VecSVE256,
429
+ [(4, 24, 1), (4, 16, 1), (8, 8, 1)],
430
+ input_dtype=torch.float,
431
+ input2_dtype=torch.float,
432
+ output_dtype=torch.float,
433
+ compute_dtype=torch.float,
434
+ ),
435
+ )
436
class CppMicroGemmFP32Vec(CppMicroGemm):
    """
    This class generates the code for micro gemm using fp32 vec instructions for compute.
    It supports input types of torch.float, torch.bfloat16, and torch.half with fp32 output.
    The output of the microkernel is in FP32, but it would be converted to BF16/FP16 in the template,
    if the desired output is BF16/FP16.
    """

    # C++/Jinja entry-point template: tiles M and N into register blocks and
    # dispatches to the per-block-size kernels generated from TEMPLATE_KERNEL.
    # Variants are selected at render time via trans_b / tail_n flags.
    TEMPLATE_ENTRY = r"""
{{declare_kernel}} {
    using Vectorized = at::vec::Vectorized<{{compute_t}}>;
    constexpr auto VLEN = Vectorized::size();
    {{kernel.assert_function}}({{block_n}} % VLEN == 0, "block_n dimension must be multiple of Vector size");
    {{kernel.assert_function}}(K % {{block_k}} == 0, "K dimension must be multiple of {{block_k}}");
    // TODO(jgong5): loop unroll for M and N
    for (int64_t m = 0; m < M; m += {{block_m}}) {
        int64_t block_m = std::min<int64_t>(M - m, {{block_m}});
        for (int64_t n = 0; n < N; n += {{block_n}}) {
            int64_t block_n = std::min<int64_t>(N - n, {{block_n}});
            if (block_m == {{block_m}} && block_n == {{block_n}}) {
                {%- if not trans_b %}
                {{kernel_name}}_kernel<{{block_m}}, {{block_n}}, accum, prefetch>(
                {%- else %}
                {{kernel_name}}_transpose_b_kernel<{{block_m}}, {{block_n}}, accum, prefetch>(
                {%- endif %}
                    A + m * lda,
                    {%- if not trans_b %}
                    B + n,
                    {%- else %}
                    B + n * ldb,
                    {%- endif %}
                    C + m * ldc + n,
                    K,
                    lda,
                    ldb,
                    ldc
                );
            {%- if tail_n %}
            } else if (block_n == {{block_n}}){
            {%- else %}
            } else {
            {%- endif %}
                switch (block_m) {
                    {%- for b in range(block_m - 1, 0, -1) %}
                    case {{b}}:
                        {%- if not trans_b %}
                        {{kernel_name}}_kernel<{{b}}, {{block_n}}, accum, prefetch>(
                        {%- else %}
                        {{kernel_name}}_transpose_b_kernel<{{b}}, {{block_n}}, accum, prefetch>(
                        {%- endif %}
                            A + m * lda,
                            {%- if not trans_b %}
                            B + n,
                            {%- else %}
                            B + n * ldb,
                            {%- endif %}
                            C + m * ldc + n,
                            K,
                            lda,
                            ldb,
                            ldc
                        );
                        break;
                    {%- endfor %}
                    default:
                        {{kernel.assert_function}}(false, "Unsupported block_m: {{block_m}}");
                }

            {%- if tail_n %}
            } else {
                switch (block_m) {
                    {%- for b in range(block_m, 0, -1) %}
                    case {{b}}:
                        {%- if not trans_b %}
                        {{kernel_name}}_ntail_kernel<{{b}}, {{block_n}}, accum, prefetch>(
                        {%- else %}
                        {{kernel_name}}_ntail_transpose_b_kernel<{{b}}, {{block_n}}, accum, prefetch>(
                        {%- endif %}
                            A + m * lda,
                            {%- if not trans_b %}
                            B + n,
                            {%- else %}
                            B + n * ldb,
                            {%- endif %}
                            C + m * ldc + n,
                            block_n,
                            K,
                            lda,
                            ldb,
                            ldc
                        );
                        break;
                    {%- endfor %}
                    default:
                        {{kernel.assert_function}}(false, "Unsupported block_m: {{block_m}}");
                }
            }
            {%- else %}
            }
            {%- endif %}
        }
    }
}
"""

    # C++/Jinja template for a single register-blocked kernel. Rendered once
    # per (trans_b, tail_n) combination in codegen_define below.
    TEMPLATE_KERNEL = r"""

template <int64_t BLOCK_M, int64_t BLOCK_N, bool accum, bool prefetch=false>
{%- if not trans_b %}
{%- if tail_n %}
inline void {{kernel_name}}_ntail_kernel(
{%- else %}
inline void {{kernel_name}}_kernel(
{%- endif %}
{%- else %}
{%- if tail_n %}
inline void {{kernel_name}}_ntail_transpose_b_kernel(
{%- else %}
inline void {{kernel_name}}_transpose_b_kernel(
{%- endif %}
{%- endif %}
    const {{input_t}}* {{restrict_keyword}} A,
    const {{input2_t}}* {{restrict_keyword}} B,
    {{output_t}}* {{restrict_keyword}} C,
    {%- if tail_n %}
    int64_t N,
    {%- endif %}
    int64_t K,
    int64_t lda,
    int64_t ldb,
    int64_t ldc
) {
    using Vectorized = at::vec::Vectorized<{{compute_t}}>;
    {%- if input2_dtype in [torch.bfloat16, torch.float16] %}
    using VectorizedIn = at::vec::Vectorized<{{input_t}}>;
    {%- endif %}

    {%- if not trans_b %}
    constexpr auto VLEN = Vectorized::size();
    constexpr auto ROWS = BLOCK_M;
    constexpr auto COLS = BLOCK_N / VLEN;

    Vectorized va;
    at::vec::VectorizedN<{{compute_t}}, COLS> vb;
    at::vec::VectorizedN<{{compute_t}}, ROWS*COLS> vc;

    {%- if tail_n %}
    int64_t rCOLS = (N + VLEN - 1) / VLEN;
    int ntail = N % VLEN;
    {%- endif %}
    auto loadc = [&](auto i) {
        if constexpr (accum) {
            constexpr int row = i / COLS;
            constexpr int col = i % COLS;
            {%- if tail_n %}
            int load_size = (col == rCOLS - 1 && ntail != 0) ? ntail : VLEN;
            if (col < rCOLS) {
                vc[i] = Vectorized::loadu(C + row * ldc + col * VLEN, load_size);
            }
            {%- else %}
            vc[i] = Vectorized::loadu(C + row * ldc + col * VLEN);
            {%- endif %}
        } else {
            vc[i] = Vectorized(0.0f);
        }
    };
    c10::ForcedUnroll<ROWS * COLS>{}(loadc);

    auto compute = [&, COLS](auto i, int k) {
        constexpr int row = i / COLS;
        constexpr int col = i % COLS;
        {%- if tail_n %}
        int load_size = (col == rCOLS - 1 && ntail != 0) ? ntail : VLEN;
        {%- endif %}
        if constexpr (col == 0) {
            {%- if alpha != 1 %}
            va = Vectorized(static_cast<{{compute_t}}>(A[row * lda + k]) * {{alpha}});
            {%- else %}
            va = Vectorized(static_cast<{{compute_t}}>(A[row * lda + k]));
            {%- endif %}
        }

        if constexpr (row == 0) {
            {%- if tail_n %}
            if (col < rCOLS) {
                {%- if input2_dtype in [torch.bfloat16, torch.float16] %}
                auto b = VectorizedIn::loadu(B + k * ldb + col * VLEN, load_size);
                vb[col] = at::vec::convert<{{compute_t}}>(b);
                {%- elif input2_dtype == torch.int8 %}
                // Convert VLEN int8 elements to int32, and then fp32
                auto b32 = at::vec::convert_to_int32<int8_t>(B + k * ldb + col * VLEN, load_size);
                vb[col] = at::vec::convert<float>(b32);
                {%- else %}
                vb[col] = Vectorized::loadu(B + k * ldb + col * VLEN, load_size);
                {%- endif %}
            } else {
                vb[col] = Vectorized(0.0f);
            }

            {%- else %}

            {%- if input2_dtype in [torch.bfloat16, torch.float16] %}
            auto b = VectorizedIn::loadu(B + k * ldb + col * VLEN, VLEN);
            vb[col] = at::vec::convert<{{compute_t}}>(b);
            {%- elif input2_dtype == torch.int8 %}
            // Convert VLEN int8 elements to int32, and then fp32
            auto b32 = at::vec::convert_to_int32<int8_t>(B + k * ldb + col * VLEN);
            if constexpr (prefetch) {
                _mm_prefetch(B + (k + {{block_k}}) * ldb + col * VLEN, _MM_HINT_T0);
            }
            vb[col] = at::vec::convert<float>(b32);
            {%- else %}
            vb[col] = Vectorized::loadu(B + k * ldb + col * VLEN);
            {%- endif %}
            {%- endif %}

        }

        constexpr int idx = row * COLS + col;
        {%- if tail_n %}
        if (col < rCOLS) {
            vc[idx] = at::vec::fmadd(va, vb[col], vc[idx]);
        }
        {%- else %}
        vc[idx] = at::vec::fmadd(va, vb[col], vc[idx]);
        {%- endif %}
    };

    for (int k = 0; k < K; ++k) {
        c10::ForcedUnroll<ROWS * COLS>{}(compute, k);
    }

    // store to C
    auto storec = [&](auto i) {
        constexpr int row = i / COLS;
        constexpr int col = i % COLS;
        {%- if tail_n %}
        int store_size = (col == rCOLS - 1 && ntail != 0) ? ntail : VLEN;
        if (col < rCOLS) {
            vc[i].store(C + row * ldc + col * VLEN, store_size);
        }
        {%- else %}
        vc[i].store(C + row * ldc + col * VLEN);
        {%- endif %}
    };
    c10::ForcedUnroll<ROWS * COLS>{}(storec);

    {%- else %}
    // Use 2 implementations for the transposed B:
    // First implementation:
    // Transpose first and then perform outer product calculation in sub-blocks,
    // which introduces an additional transpose overhead of [K, N] compared to the non-transpose version.
    // Second implementation:
    // Directly perform inner product calculation in sub-blocks,
    // which introduces an additional vector reduction of [M, N] compared to the non-tranpose version.
    // Therefore, when M * N / (K * N) is large, the first implementation has better performance.
    {%- if tail_n %}
    if (K % Vectorized::size() == 0 && N % Vectorized::size() == 0 && 24 * BLOCK_M > K) {
    {%- else %}
    if (K % Vectorized::size() == 0 && 24 * BLOCK_M > K) {
    {%- endif %}
        // First implementation:
        constexpr auto VLEN = Vectorized::size();
        constexpr auto ROWS = BLOCK_M;
        constexpr auto COLS = BLOCK_N / VLEN;
        int _K = K / VLEN;
        Vectorized va;
        at::vec::VectorizedN<{{compute_t}}, VLEN> vb;
        at::vec::VectorizedN<{{compute_t}}, ROWS*COLS> vc;
        auto loadc = [&](auto i) {
            if constexpr (accum) {
                constexpr int row = i / COLS;
                constexpr int col = i % COLS;
                vc[i] = Vectorized::loadu(C + row * ldc + col * VLEN);
            } else {
                vc[i] = Vectorized(0.0f);
            }
        };
        c10::ForcedUnroll<ROWS * COLS>{}(loadc);
        auto unroll_loadB = [&](auto i, const {{input2_t}}* {{restrict_keyword}} src_ptr) {
            {%- if input2_dtype in [torch.bfloat16, torch.float16] %}
            auto b = VectorizedIn::loadu(src_ptr + i * ldb, VLEN);
            vb[i] = at::vec::convert<{{compute_t}}>(b);
            {%- elif input2_dtype == torch.int8 %}
            auto b32 = at::vec::convert_to_int32<int8_t>(src_ptr + i * ldb, VLEN);
            vb[i] = at::vec::convert<float>(b32);
            {%- else %}
            vb[i] = Vectorized::loadu(src_ptr + i * ldb, VLEN);
            {%- endif %}
        };
        auto compute_trans = [&, COLS](auto i, int k) {
            constexpr int row = i % ROWS;
            constexpr int col = i / ROWS;
            constexpr int e_col = col * VLEN;
            int idk = k * VLEN;
            if constexpr (row == 0) {
                c10::ForcedUnroll<VLEN>{}(unroll_loadB, B + e_col * ldb + idk);
                at::vec::transpose_block(vb);
            }
            constexpr int idx = row * COLS + col;
            {{kernel.unroll_pragma(16)}}
            for (int j = 0; j < VLEN; j++) {
                {%- if alpha != 1 %}
                va = Vectorized(static_cast<{{compute_t}}>(A[row * lda + idk + j]) * {{alpha}});
                {%- else %}
                va = Vectorized(static_cast<{{compute_t}}>(A[row * lda + idk + j]));
                {%- endif %}
                vc[idx] = at::vec::fmadd(va, vb[j], vc[idx]);
            }
        };
        for (int k = 0; k < _K; ++k) {
            c10::ForcedUnroll<ROWS * COLS>{}(compute_trans, k);
        }
        // store to C
        auto storec = [&](auto i) {
            constexpr int row = i / COLS;
            constexpr int col = i % COLS;
            vc[i].store(C + row * ldc + col * VLEN);
        };
        c10::ForcedUnroll<ROWS * COLS>{}(storec);
    } else {
        // Second implementation
        {%- if input2_dtype in [torch.bfloat16, torch.float16] %}
        constexpr auto VLEN = VectorizedIn::size();
        {%- else %}
        constexpr auto VLEN = Vectorized::size();
        {%- endif %}
        int _K = (K + VLEN - 1) / VLEN;
        // sub-block size of BLOCK_N and BLOCK_M
        constexpr int sM = {{sub_block_m}};
        constexpr int sN = {{sub_block_n}};
        {%- if tail_n %}
        int bN = (N + sN - 1) / sN;
        {%- else %}
        constexpr int bN = (BLOCK_N + sN - 1) / sN;
        {%- endif %}
        constexpr int bM = (BLOCK_M + sM - 1) / sM;

        {%- if input2_dtype in [torch.bfloat16, torch.float16] %}
        at::vec::VectorizedN<{{compute_t}}, 2> va;
        at::vec::VectorizedN<{{compute_t}}, 2 * sN> vb;
        {%- else %}
        at::vec::Vectorized<{{compute_t}}> va;
        at::vec::VectorizedN<{{compute_t}}, sN> vb;
        {%- endif %}
        at::vec::VectorizedN<{{compute_t}}, sN * sM> vmid;

        {%- if tail_n %}
        int ntail = N % sN;
        {%- else %}
        constexpr int ntail = BLOCK_N % sN;
        {%- endif %}
        constexpr int mtail = BLOCK_M % sM;
        int ktail = K % VLEN;

        auto compute_trans = [&](int m, int n, int k) {
            {%- if tail_n %}
            int e_n = (n == bN - 1 && ntail != 0) ? (N - n * sN) : sN;
            {%- else %}
            int e_n = (n == bN - 1 && ntail != 0) ? (BLOCK_N - n * sN) : sN;
            {%- endif %}
            int e_m = (m == bM - 1 && mtail != 0) ? (BLOCK_M - m * sM) : sM;
            int e_k = (k == _K - 1 && ktail != 0) ? (K - k * VLEN) : VLEN;
            {{kernel.unroll_pragma(sub_block_n)}}
            for (int i = 0; i < e_n; i++) {
                {%- if input2_dtype in [torch.bfloat16, torch.float16] %}
                auto b = VectorizedIn::loadu(B + (sN * n + i) * ldb + k * VLEN, e_k);
                std::tie(vb[2 * i], vb[2 * i + 1]) = at::vec::convert_to_float<{{input_t}}>(b);
                {%- elif input2_dtype == torch.int8 %}
                auto b32 = at::vec::convert_to_int32<int8_t>(B + (sN * n + i) * ldb + k * VLEN, e_k);
                vb[i] = at::vec::convert<float>(b32);
                {%- else %}
                vb[i] = Vectorized::loadu(B + (sN * n + i) * ldb + k * VLEN, e_k);
                {%- endif %}
            }

            {{kernel.unroll_pragma(sub_block_m)}}
            for (int s = 0; s < e_m; s++) {
                {%- if input2_dtype in [torch.bfloat16, torch.float16] %}
                auto a = VectorizedIn::loadu(A + (sM * m + s) * lda + k * VLEN, e_k);
                std::tie(va[0], va[1]) = at::vec::convert_to_float<{{input_t}}>(a);
                {%- elif input2_dtype == torch.int8 %}
                auto a32 = at::vec::convert_to_int32<int8_t>(A + (sM * m + s) * lda + k * VLEN, e_k);
                va = at::vec::convert<float>(a32);
                {%- else %}
                va = Vectorized::loadu(A + (sM * m + s) * lda + k * VLEN, e_k);
                {%- endif %}

                {%- if alpha != 1 %}
                va = va * Vectorized({{alpha}});
                {%- endif %}
                if (k == 0) {
                    {{kernel.unroll_pragma(sub_block_n)}}
                    for (int i = 0; i < e_n; i++) {
                        {%- if input2_dtype in [torch.bfloat16, torch.float16] %}
                        vmid[sN * s + i] = at::vec::fmadd(va[0], vb[2 * i], Vectorized(0.0f));
                        vmid[sN * s + i] = at::vec::fmadd(va[1], vb[2 * i + 1], vmid[sN * s + i]);
                        {%- else %}
                        vmid[sN * s + i] = at::vec::fmadd(va, vb[i], Vectorized(0.0f));
                        {%- endif %}
                    }
                } else {
                    {{kernel.unroll_pragma(sub_block_n)}}
                    for (int i = 0; i < e_n; i++) {
                        {%- if input2_dtype in [torch.bfloat16, torch.float16] %}
                        vmid[sN * s + i] = at::vec::fmadd(va[0], vb[2 * i], vmid[sN * s + i]);
                        vmid[sN * s + i] = at::vec::fmadd(va[1], vb[2 * i + 1], vmid[sN * s + i]);
                        {%- else %}
                        vmid[sN * s + i] = at::vec::fmadd(va, vb[i], vmid[sN * s + i]);
                        {%- endif %}
                    }
                }
            }

            // store to C
            if (k == _K - 1) {
                {{kernel.unroll_pragma(sub_block_m)}}
                for (int s = 0; s < e_m; s++) {
                    {{kernel.unroll_pragma(sub_block_n)}}
                    for (int i = 0; i < e_n; i++) {
                        auto v = at::vec::vec_reduce_all([](Vectorized& x, Vectorized& y) { return x + y; }, vmid[sN * s + i]);
                        if constexpr (accum) {
                            auto c = *(C + (sM * m + s) * ldc + sN * n + i);
                            *(C + (sM * m + s) * ldc + sN * n + i) = c + v;
                        } else {
                            *(C + (sM * m + s) * ldc + sN * n + i) = v;
                        }
                    }
                }
            }
        };

        for (int n = 0; n < bN; ++n) {
            for (int m = 0; m < bM; ++m) {
                for (int k = 0; k < _K; ++k) {
                    compute_trans(m, n, k);
                }
            }
        }
    }
    {%- endif %}
}
"""

    # set trans_b to generate gemm that supports transposed B matrix
    # set tail_n to support the tail of N
    # TODO add trans_b support for other micro gemms
    # and move setting of trans_b to the init of CppMicroGemm
    def __init__(
        self,
        name,
        input_dtype,
        input2_dtype,
        output_dtype,
        compute_dtype,
        register_blocking,
        alpha=1,
        tail_n=False,
        trans_b=False,
    ) -> None:
        """
        Initialize the FP32-vec micro-gemm.

        Extra args beyond the base class:
            tail_n: also generate kernels that handle a partial (tail) N block.
            trans_b: generate kernels for a transposed B operand.
        """
        super().__init__(
            name,
            input_dtype,
            input2_dtype,
            output_dtype,
            compute_dtype,
            register_blocking,
            alpha,
        )
        self.tail_n = tail_n
        # trans_b is only supported on platforms that
        # support avx512 or avx2 since transpose_block is
        # only implemented on these platforms
        if trans_b:
            vec_isa = pick_vec_isa()
            assert issubclass(vec_isa.__class__, VecAVX512) or issubclass(
                vec_isa.__class__, VecAVX2
            )
        self.trans_b = trans_b

    def codegen_define(self, kernel: CppTemplateKernel) -> str:
        """
        Render the C++ source for this micro-gemm: the register-blocked
        kernel(s) (plus an N-tail variant when self.tail_n) followed by the
        entry-point function, and return it as one string.
        """
        options = {
            "declare_kernel": self.get_kernel_declaration(),
            "kernel": kernel,
            "block_m": self.register_blocking.block_m,
            "block_n": self.register_blocking.block_n,
            "block_k": self.register_blocking.block_k,
            "trans_b": False,
            "tail_n": False,
            "restrict_keyword": get_restrict_keyword(),
            **self.get_common_options(),
        }
        if self.trans_b:
            # TODO supports tuning of sub_block_m/sub_block_n
            # to get better performance for specific shapes
            sub_block_m = min(1, self.register_blocking.block_m)
            sub_block_n = min(4, self.register_blocking.block_n)
            # update options to generate kernel with trans_b and sub-block size
            options.update(
                {
                    "trans_b": self.trans_b,
                    "sub_block_m": sub_block_m,
                    "sub_block_n": sub_block_n,
                }
            )
        result = KernelTemplate._template_from_string(self.TEMPLATE_KERNEL).render(
            options
        )
        # update options to generate the kernel for the tail of N
        if self.tail_n:
            options.update(
                {
                    "tail_n": self.tail_n,
                }
            )
            result += KernelTemplate._template_from_string(self.TEMPLATE_KERNEL).render(
                options
            )
        result += KernelTemplate._template_from_string(self.TEMPLATE_ENTRY).render(
            options
        )
        return result
958
+
959
+
960
+ # extra check for CppMicroGemmAMX
961
def check_amx_extra(config, m, n, k, alpha, num_threads, **kwargs):
    """Extra eligibility check for CppMicroGemmAMX: K must be a multiple of
    the VNNI packing size (4 for int8/uint8 input, 2 otherwise) and no alpha
    scaling may be requested."""
    if alpha != 1:
        return False
    vnni_size = 4 if config.input_dtype in (torch.uint8, torch.int8) else 2
    return k % vnni_size == 0
964
+
965
+
966
@register_micro_gemm(
    *generate_gemm_config(
        VecAMX,
        [(32, 32, 64), (48, 16, 64)],
        input_dtype=torch.int8,
        input2_dtype=torch.int8,
        output_dtype=torch.int32,
        compute_dtype=torch.int32,
        extra_check=check_amx_extra,
    ),
    *generate_gemm_config(
        VecAMX,
        [(32, 32, 32), (48, 16, 32), (16, 48, 32)],
        input_dtype=torch.bfloat16,
        input2_dtype=torch.int8,
        output_dtype=torch.float,
        compute_dtype=torch.float,
        extra_check=check_amx_extra,
    ),
    *generate_gemm_config(
        VecAMX,
        [(32, 32, 32), (48, 16, 32), (16, 48, 32)],
        input_dtype=torch.bfloat16,
        output_dtype=torch.float,
        extra_check=check_amx_extra,
    ),
    *generate_gemm_config(
        VecAMX,
        [(32, 32, 64), (48, 16, 64)],
        input_dtype=torch.uint8,
        input2_dtype=torch.int8,
        output_dtype=torch.int32,
        compute_dtype=torch.int32,
        extra_check=check_amx_extra,
    ),
)
class CppMicroGemmAMX(CppMicroGemm):
    """
    This class generates the code for micro gemm using Advanced Matrix extension (AMX)
    instructions available in 4th generation Intel Xeon for compute.
    It supports input types of torch.bfloat16 with fp32 output.
    """

    # C++/Jinja entry-point template: optionally dequantizes / VNNI-packs B,
    # then tiles M into 16-row AMX slabs and dispatches to the generated
    # {{kernel_name}}_amx_kernel_<rows>_<cols> kernels.
    TEMPLATE_ENTRY = r"""
{{declare_kernel}} {
    {{kernel.assert_function}}(N % {{block_n}} == 0, "N dimension must be multiple of {{block_n}}");
    {{kernel.assert_function}}(K % 2 == 0, "K dimension must be multiple of 2");
    {%- if pack_vnni_B_locally %}
    {{template.codegen_allocate_weight_buffer("packed_B_buf", input2_t, "K", block_n)}}
    {%- endif %}
    {%- if use_cached_dequantized_B %}
    // Create a stack-allocated buffer for tiles of B.
    // Except maybe for the tail-case, an AMX tile of B has 16x32 BF16 elements.
    // we cache K * {{block_n}} elements of dequantized B
    {{template.codegen_allocate_weight_buffer("dequantized_B_buf", input_t, "K", block_n)}}
    const auto buf_size = K * {{block_n}};
    auto load_dequantized_B = [&](int base_idx) {
        // Load a tile of B & cache it in L1D.
        {{input2_t}}* base_addr = const_cast<{{input2_t}}*>(B) + base_idx;
        for (int idx_dq = 0, idx_q = 0; idx_dq < buf_size; idx_q += ldb, idx_dq += {{block_n}}) {
            {%- for vec_idx in range(0, block_n, 32) %}
            {%- if (block_n - vec_idx) >= 32 %}
            auto b_int8_idx_{{vec_idx}} = at::vec::Vectorized<int8_t>::loadu(
                base_addr + idx_q + {{vec_idx}} ,
                static_cast<int64_t>(32)
            );
            auto b_bf16_idx_{{vec_idx}} = at::vec::convert<{{input_t}}>(b_int8_idx_{{vec_idx}});
            b_bf16_idx_{{vec_idx}}.store(dequantized_B_buf + idx_dq + {{vec_idx}});
            {%- else %}
            auto b_int8_tail = at::vec::Vectorized<int8_t>::loadu(
                base_addr + idx_q + {{block_n - (block_n % 32)}},
                static_cast<int64_t>({{block_n % 32}})
            );
            auto b_bf16_tail = at::vec::convert<{{input_t}}>(b_int8_tail);
            b_bf16_tail.store(
                dequantized_B_buf + idx_dq + {{block_n - (block_n % 32)}},
                static_cast<int64_t>({{block_n % 32}})
            );
            {%- endif %}
            {%- endfor %}
        }
    };
    {%- endif %}
    // The ldb would not be block_n if N != block_n
    {%- if use_cached_dequantized_B or pack_vnni_B_locally %}
    const int64_t updated_ldb = {{block_n}};
    {%- else %}
    const int64_t updated_ldb = ldb;
    {%- endif %}
    // TODO(jgong5): loop unroll for M and N
    for (int64_t n = 0; n < N; n += {{block_n}}) {
        {%- if pack_vnni_B_locally %}
        // Pack non-constant weights into VNNI interleaved format in packed_B_buf
        at::vec::pack_vnni2(B + n, packed_B_buf, ldb, K, {{block_n}});
        {%- elif use_cached_dequantized_B %}
        // Dequantize K * block_n int8 B elements into BF16
        load_dequantized_B(n);
        {%- endif %}
        for (int64_t m = 0; m < M; m += {{block_m}}) {
            int64_t block_m = std::min<int64_t>(M - m, {{block_m}});
            int64_t m_tail = m;
            {%- for num_rows in range(block_m, 0, -16) %}
            {%- if num_rows != block_m %}
            else
            {%- endif %}
            if (block_m >= {{num_rows}}) {
                {{kernel_name}}_amx_kernel_{{num_rows}}_{{num_columns}}<accum>(
                    amx_state,
                    A + m * lda,
                    {%- if use_cached_dequantized_B %}
                    dequantized_B_buf,
                    {%- elif pack_vnni_B_locally %}
                    packed_B_buf,
                    {%- else %}
                    B + n,
                    {%- endif %}
                    C + m * ldc + n,
                    K,
                    lda,
                    updated_ldb,
                    ldc,
                    16
                );
                block_m -= {{num_rows}};
                m_tail += {{num_rows}};
            }
            {%- endfor %}
            if (block_m > 0) {
                {{kernel_name}}_amx_kernel_16_{{num_columns}}<accum>(
                    amx_state,
                    A + m_tail * lda,
                    {%- if use_cached_dequantized_B %}
                    dequantized_B_buf,
                    {%- elif pack_vnni_B_locally %}
                    packed_B_buf,
                    {%- else %}
                    B + n,
                    {%- endif %}
                    C + m_tail * ldc + n,
                    K,
                    lda,
                    updated_ldb,
                    ldc,
                    block_m
                );
            }
        }
    }
}
"""

    # C++/Jinja template for one AMX tile kernel; rendered once per number of
    # rows (multiples of 16) in codegen_define below.
    TEMPLATE_KERNEL = r"""

template <bool accum, bool prefetch=false>
inline void {{kernel_name}}_amx_kernel_{{num_rows}}_{{num_columns}}(
    AMXState& amx_state,
    const {{input_t}}* {{restrict_keyword}} A,
    {%- if use_cached_dequantized_B %}
    const {{input_t}}* {{restrict_keyword}} B,
    {%- else %}
    const {{input2_t}}* {{restrict_keyword}} B,
    {%- endif %}
    {{output_t}}* {{restrict_keyword}} C,
    int64_t K,
    int64_t lda,
    int64_t ldb,
    int64_t ldc,
    uint8_t tilecfg_rows
) {
    // TODO(jgong5): add prefetch hint for A, B, C
    auto loadconfig = [](const amx_tilecfg& cfg) {
        _tile_loadconfig(&cfg);
    };
    const auto last_k_offset = K / {{block_k}} * {{block_k}};
    const auto tail_k_size = K - last_k_offset;
    if C10_LIKELY (last_k_offset > 0) {
        amx_state.configure(tilecfg_rows, 64, {{num_rows}} / 16, {{num_columns}}, loadconfig);
    } else {
        amx_state.configure(tilecfg_rows, tail_k_size * sizeof({{input_t}}), {{num_rows}} / 16, {{num_columns}}, loadconfig);
    }
    auto load_c = [&]() {
        {%- for tile_row in range(num_rows // 16) %}
        {%- for tile_col in range(num_columns) %}
        {%- set tile_idx = tile_row * num_columns + tile_col %}
        _tile_loadd({{tile_idx}}, C + {{tile_row * 16}} * ldc + {{tile_col * 16}}, ldc * sizeof({{output_t}}));
        {%- endfor %}
        {%- endfor %}
    };
    auto zero_c = [&]() {
        {%- for tile_row in range(num_rows // 16) %}
        {%- for tile_col in range(num_columns) %}
        {%- set tile_idx = tile_row * num_columns + tile_col %}
        _tile_zero({{tile_idx}});
        {%- endfor %}
        {%- endfor %}
    };

    if constexpr (accum) {
        load_c();
    } else {
        zero_c();
    }

    auto compute = [&](int k) {
        {%- set tile_offset_a = num_rows // 16 * num_columns %}
        {%- set tile_offset_b = tile_offset_a + num_rows // 16 %}
        {%- for tile_row in range(num_rows // 16) %}
        {%- for tile_col in range(num_columns) %}
        {%- set tile_idx_a = tile_offset_a + tile_row %}
        {%- set tile_idx_b = tile_offset_b + tile_col %}
        {%- set tile_idx_c = tile_row * num_columns + tile_col %}
        {%- if tile_col == 0 %}
        _tile_stream_loadd({{tile_idx_a}}, A + {{tile_row * 16}} * lda + k, lda * sizeof({{input_t}}));
        {%- endif %}
        {%- if tile_row == 0 %}
        _tile_loadd({{tile_idx_b}}, B + k * ldb + {{tile_col * 16 * vnni_size}}, ldb * {{vnni_size}} * sizeof({{input_t}}));
        {%- endif %}
        {%- if int8_gemm %}
        {%- if input_dtype == torch.int8 %}
        _tile_dpbssd({{tile_idx_c}}, {{tile_idx_a}}, {{tile_idx_b}});
        {%- else %}
        _tile_dpbusd({{tile_idx_c}}, {{tile_idx_a}}, {{tile_idx_b}});
        {%- endif %}
        {%- else %}
        _tile_dpbf16ps({{tile_idx_c}}, {{tile_idx_a}}, {{tile_idx_b}});
        {%- endif %}
        {%- endfor %}
        {%- endfor %}
    };

    {{kernel.unroll_pragma(4)}}
    for (int k = 0; k < last_k_offset; k += {{block_k}}) {
        compute(k);
    }

    auto store_c = [&]() {
        // store to C
        {%- for tile_row in range(num_rows // 16) %}
        {%- for tile_col in range(num_columns) %}
        {%- set tile_idx = tile_row * num_columns + tile_col %}
        _tile_stored({{tile_idx}}, C + {{tile_row * 16}} * ldc + {{tile_col * 16}}, ldc * sizeof({{output_t}}));
        {%- endfor %}
        {%- endfor %}
    };

    // TODO(jgong5): move tail k computation to separate loopnest to save tile configuration overhead
    if C10_UNLIKELY (tail_k_size > 0) {
        if C10_LIKELY (last_k_offset > 0) {
            store_c();
            amx_state.configure(tilecfg_rows, tail_k_size * sizeof({{input_t}}), {{num_rows}} / 16, {{num_columns}}, loadconfig);
            load_c();
        }
        compute(last_k_offset);
    }

    store_c();
}
"""

    def codegen_define(self, kernel: CppTemplateKernel) -> str:
        """
        Render the C++ source for the AMX micro-gemm: one tile kernel per
        16-row slab of block_m, followed by the entry-point function.
        """
        block_m, block_n, block_k = self.register_blocking
        assert block_m % 16 == 0, "Only support block_m % 16 == 0 for AMX"
        assert block_n % 16 == 0, "Only support block_n % 16 == 0 for AMX"
        if self.input_dtype in [torch.uint8, torch.int8]:
            assert block_k == 64, "Only support block_k = 64 for AMX INT8"
        else:
            assert block_k == 32, "Only support block_k = 32 for AMX Bfloat16/Float16"
        num_columns = block_n // 16
        options = {
            "declare_kernel": self.get_kernel_declaration(),
            # bf16 activation with int8 weight: dequantize B once per n-block
            # and reuse the cached BF16 copy in the tile kernels
            "use_cached_dequantized_B": (
                self.input_dtype == torch.bfloat16
                and self.input2_dtype in [torch.int8, torch.uint8]
            ),
            "kernel": kernel,
            "block_m": block_m,
            "block_n": block_n,
            "block_k": block_k,
            "num_columns": num_columns,
            "restrict_keyword": get_restrict_keyword(),
            **self.get_common_options(),
        }
        result = ""
        for num_rows in range(block_m, 0, -16):
            amx_kernel_options = {**options, "num_rows": num_rows}
            result += KernelTemplate._template_from_string(self.TEMPLATE_KERNEL).render(
                amx_kernel_options
            )
        result += KernelTemplate._template_from_string(self.TEMPLATE_ENTRY).render(
            options
        )
        return result

    def codegen_init(
        self,
        kernel: CppTemplateKernel,
    ) -> str:
        # Declare the AMX tile-configuration state used by the kernels.
        return "AMXState amx_state;"

    def codegen_finalize(
        self,
        kernel: CppTemplateKernel,
    ) -> str:
        # Release the AMX tile state when the kernel is done.
        return "amx_state.release([]() { _tile_release(); });"

    def get_kernel_extra_args_declare(self) -> str:
        # Extra parameter threaded through the generated kernel signature.
        return "AMXState& amx_state,"

    def get_kernel_extra_args(self, **kwargs) -> list[str]:
        # Matching argument for get_kernel_extra_args_declare above.
        return ["amx_state,"]

    def get_b_layout(self):
        # AMX consumes B in VNNI-packed layout: 4-wide for int8, 2-wide otherwise.
        if self.input_dtype in [torch.uint8, torch.int8]:
            return LayoutType.VNNI4
        else:
            return LayoutType.VNNI2
1282
+
1283
+
1284
+ # extra check for CppMicroBrgemm
1285
+ def check_brgemm_extra(config, m, n, k, alpha, num_threads, **kwargs):
1286
+ assert config.input_dtype == torch.half and config.output_dtype == torch.float
1287
+ vnni_size = 2
1288
+ # use brgemm for Half when amx_fp16 is supported
1289
+ return torch.cpu._is_amx_fp16_supported() and k % vnni_size == 0 and alpha == 1
1290
+
1291
+
1292
@register_micro_gemm(
    *generate_gemm_config(
        VecAMX,
        [(32, 32, 32), (48, 16, 32), (16, 48, 32)],
        input_dtype=torch.half,
        output_dtype=torch.float,
        extra_check=check_brgemm_extra,
    ),
)
class CppMicroBrgemm(CppMicroGemm):
    """
    This class generates the code for micro gemm using oneDNN brgemm.
    It supports input types of torch.half.
    """

    # C++/Jinja template: a thin wrapper that optionally VNNI-packs B locally
    # and delegates the whole block to at::native::cpublas::brgemm.
    TEMPLATE_ENTRY = r"""
#include <ATen/native/CPUBlas.h>
{{declare_kernel}} {
    {%- if pack_vnni_B_locally %}
    {{template.codegen_allocate_weight_buffer("packed_B_buf", input2_t, "K * N")}}
    at::vec::pack_vnni2(B, packed_B_buf, ldb, K, N);
    {%- endif %}
    at::native::cpublas::brgemm(
      M, N, K,
      {%- if pack_vnni_B_locally %}
      lda, N, ldc,
      {%- else %}
      lda, ldb, ldc,
      {%- endif %}
      accum,
      A,
      {%- if pack_vnni_B_locally %}
      packed_B_buf,
      {%- else %}
      B,
      {%- endif %}
      C);
}
"""

    def codegen_define(self, kernel: CppTemplateKernel) -> str:
        """Render the brgemm wrapper source for this micro-gemm."""
        options = {
            "declare_kernel": self.get_kernel_declaration(),
            "kernel": kernel,
            "block_m": self.register_blocking.block_m,
            "block_n": self.register_blocking.block_n,
            "block_k": self.register_blocking.block_k,
            "restrict_keyword": get_restrict_keyword(),
            **self.get_common_options(),
        }
        result = ""
        result += KernelTemplate._template_from_string(self.TEMPLATE_ENTRY).render(
            options
        )
        return result

    def codegen_finalize(
        self,
        kernel: CppTemplateKernel,
    ) -> str:
        # Release oneDNN brgemm resources after the kernel finishes.
        return "at::native::cpublas::brgemm_release();"

    def get_b_layout(self):
        # brgemm is only registered for Half on AMX-FP16 machines and expects
        # B in VNNI2 layout.
        assert self.input_dtype == torch.half and torch.cpu._is_amx_fp16_supported()
        return LayoutType.VNNI2
1357
+
1358
+
1359
def check_woq_int4_extra(config, m, n, k, alpha, num_threads, **kwargs):
    """Extra eligibility check for the WoQ int4 micro-gemm: requires
    alpha == 1, a quantization group size >= 32 that divides K and is at
    least block_k, K a multiple of block_k, and N a multiple of 64."""
    if alpha != 1:
        return False
    q_group_size = kwargs.get("q_group_size", None)
    assert q_group_size is not None
    if q_group_size < 32 or k % q_group_size != 0:
        return False
    block_k = config.register_blocking.block_k
    if block_k > q_group_size:
        return False
    return k % block_k == 0 and n % 64 == 0
1371
+
1372
+
1373
+ @register_micro_gemm(
1374
+ # TODO: support float/half input
1375
+ *generate_gemm_config(
1376
+ VecAVX512,
1377
+ [(4, 64, 32), (4, 64, 64), (4, 64, 128)],
1378
+ input_dtype=torch.bfloat16,
1379
+ input2_dtype=torch.uint8,
1380
+ output_dtype=torch.float,
1381
+ compute_dtype=torch.float,
1382
+ extra_check=check_woq_int4_extra,
1383
+ ),
1384
+ )
1385
+ class CppMicroGemmWoQInt4Avx512(CppMicroGemmFP32Vec):
1386
+ """
1387
+ This class generates the code for WoQ int4 micro gemm using AVX512 intrinsics.
1388
+ It is based on the corresponding ATen kernel.
1389
+ Shape of packed weight = [N // 64, K, 32], viewed as [N, K // 2]
1390
+ Shape of packed ScalesAndZeros = [K // group_size, N, 2]
1391
+ """
1392
+
1393
+ TEMPLATE_ENTRY = r"""
1394
+ {{declare_kernel}} {
1395
+ {{kernel.assert_function}}(N % {{block_n}} == 0, "N dimension must be multiple of {{block_n}}");
1396
+ {{kernel.assert_function}}(K % {{block_k}} == 0, "K dimension must be multiple of {{block_k}}");
1397
+ auto group_size = q_group_size;
1398
+ for (int64_t m = 0; m < M; m += {{block_m}}) {
1399
+ int64_t block_m = std::min<int64_t>(M - m, {{block_m}});
1400
+ for (int64_t n = 0; n < N; n += {{block_n}}) {
1401
+ if (block_m == {{block_m}}) {
1402
+ {{kernel_name}}_kernel<{{block_m}}, {{block_n}}, accum>(
1403
+ A + m * lda,
1404
+ reinterpret_cast<const uint8_t*>(B) + n * ldb,
1405
+ C + m * ldc + n,
1406
+ K,
1407
+ lda,
1408
+ /* ldb */ {{block_n}} / 2,
1409
+ ldc,
1410
+ group_size,
1411
+ ScaleAndZeros + n * 2,
1412
+ lds,
1413
+ k_start
1414
+ );
1415
+ } else {
1416
+ switch (block_m) {
1417
+ {%- for b in range(block_m - 1, 0, -1) %}
1418
+ case {{b}}:
1419
+ {{kernel_name}}_kernel<{{b}}, {{block_n}}, accum>(
1420
+ A + m * lda,
1421
+ reinterpret_cast<const uint8_t*>(B) + n * ldb,
1422
+ C + m * ldc + n,
1423
+ K,
1424
+ lda,
1425
+ /* ldb */ {{block_n}} / 2,
1426
+ ldc,
1427
+ group_size,
1428
+ ScaleAndZeros + n * 2,
1429
+ lds,
1430
+ k_start
1431
+ );
1432
+ break;
1433
+ {%- endfor %}
1434
+ default:
1435
+ {{kernel.assert_function}}(false, "Unsupported block_m: ", block_m);
1436
+ }
1437
+ }
1438
+ }
1439
+ }
1440
+ }
1441
+ """
1442
+
1443
+ TEMPLATE_KERNEL = r"""
1444
+ inline bool {{kernel_name}}_is_block_start(int index, int k_start, int group_size) {
1445
+ return (k_start + index) % group_size == 0;
1446
+ }
1447
+
1448
+ inline __m128i {{kernel_name}}_convert_int4_to_int8(const uint8_t* data) {
1449
+ __m128i tmp = _mm_loadu_si64((const __m128i*)data);
1450
+ __m128i bytes = _mm_cvtepu8_epi16(tmp);
1451
+ const __m128i lowMask = _mm_set1_epi8(0xF);
1452
+ __m128i high = _mm_andnot_si128(lowMask, bytes);
1453
+ __m128i low = _mm_and_si128(lowMask, bytes);
1454
+ high = _mm_slli_epi16(high, 4);
1455
+ bytes = _mm_or_si128(low, high);
1456
+ return bytes;
1457
+ }
1458
+
1459
+ template <int64_t BLOCK_M, int64_t BLOCK_N, bool accum>
1460
+ inline void {{kernel_name}}_kernel(
1461
+ const {{input_t}}* {{restrict_keyword}} A,
1462
+ const uint8_t* {{restrict_keyword}} B,
1463
+ {{output_t}}* {{restrict_keyword}} C,
1464
+ int64_t K,
1465
+ int64_t lda,
1466
+ int64_t ldb,
1467
+ int64_t ldc,
1468
+ int64_t q_group_size,
1469
+ const at::BFloat16* {{restrict_keyword}} ScaleAndZeros,
1470
+ int64_t lds, // leading dimension of ScaleAndZeros
1471
+ int64_t k_start) {
1472
+ constexpr int BLOCK_K = {{block_k}};
1473
+ constexpr int ROWS = BLOCK_M;
1474
+ constexpr int COLS = BLOCK_N / 16;
1475
+
1476
+ const int PREFETCH_SIZE_K = 16 * 4;
1477
+ const int PREFETCH_SIZE_KB = (PREFETCH_SIZE_K + BLOCK_K - 1) / BLOCK_K;
1478
+
1479
+ // number of blocks on K
1480
+ const int KB = K / BLOCK_K;
1481
+
1482
+ __m512 va;
1483
+ __m512 vb[COLS];
1484
+ __m512 vc[ROWS * COLS];
1485
+ __m512 scale[COLS];
1486
+ __m512 zero[COLS];
1487
+
1488
+ // Lookup table to de-quantize int4 values to bf16.
1489
+ // Values are dequantized as truly int4 [-8, 7] range;
1490
+ //
1491
+ // dequant = (bf16(int4_value) * bf16_scale) + bf16_zero
1492
+ //
1493
+ static const __m512 lut = _mm512_set_ps(
1494
+ 7.0f, 6.0f, 5.0f, 4.0f,
1495
+ 3.0f, 2.0f, 1.0f, 0.0f,
1496
+ -1.0f, -2.0f, -3.0f, -4.0f,
1497
+ -5.0f, -6.0f, -7.0f, -8.0f);
1498
+
1499
+ // index for transpose
1500
+ static const __m512i idx1 = _mm512_set_epi32(
1501
+ 30, 28, 26, 24, 22, 20, 18, 16,
1502
+ 14, 12, 10, 8, 6, 4, 2, 0);
1503
+ static const __m512i idx2 = _mm512_set_epi32(
1504
+ 31, 29, 27, 25, 23, 21, 19, 17,
1505
+ 15, 13, 11, 9, 7, 5, 3, 1);
1506
+
1507
+ // load scale and zero point
1508
+ auto load_scale_and_zeros = [&](int i, int _kb) {
1509
+ // load 2x bfloat16 vector
1510
+ __m512i t = _mm512_loadu_si512((__m512i*)(ScaleAndZeros + _kb * lds + 32 * i));
1511
+ if (_kb + PREFETCH_SIZE_KB < KB) {
1512
+ _mm_prefetch(ScaleAndZeros + (_kb + PREFETCH_SIZE_KB) * lds + 32 * i, _MM_HINT_T0);
1513
+ }
1514
+
1515
+ // convert to 2x f32 vector
1516
+ __m512 a, b;
1517
+ at::vec::cvtbf16_fp32(t, a, b);
1518
+
1519
+ // transpose scale_and_zero from {16, 2} to {2, 16}
1520
+ // inputs:
1521
+ // a: {s0, z0, s1, z1, ..., s7, z7}
1522
+ // b: {s8, z8, s9, z9, ..., s15, z15}
1523
+ // output:
1524
+ // scale: {s0, s1, s2, ..., s15}
1525
+ // zero: {z0, z1, z2, ..., z15}
1526
+ scale[i] = _mm512_mask_permutex2var_ps(a, 0xffff, idx1, b);
1527
+ zero[i] = _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b);
1528
+ };
1529
+
1530
+ auto loadc = [&](auto i) {
1531
+ if constexpr (accum) {
1532
+ constexpr int row = i / COLS;
1533
+ constexpr int col = i % COLS;
1534
+ vc[i] = _mm512_loadu_ps(C + row * ldc + col * 16);
1535
+ } else {
1536
+ vc[i] = _mm512_setzero_ps();
1537
+ }
1538
+ };
1539
+ c10::ForcedUnroll<ROWS * COLS>{}(loadc);
1540
+
1541
+ auto compute = [&, COLS](auto i, int k) {
1542
+ constexpr int row = i / COLS;
1543
+ constexpr int col = i % COLS;
1544
+
1545
+ if constexpr (col == 0) {
1546
+ float aa = static_cast<float>(A[row * lda + k]);
1547
+ if (k + PREFETCH_SIZE_K < K) {
1548
+ _mm_prefetch(A + row * lda + k + PREFETCH_SIZE_K, _MM_HINT_T0);
1549
+ }
1550
+ va = _mm512_set1_ps(aa);
1551
+ }
1552
+
1553
+ if constexpr (row == 0) {
1554
+ if constexpr (COLS == 4) {
1555
+ // when BLOCK_N = 64, handle each row at a time
1556
+ // to reduce de-quantize overhead.
1557
+ if constexpr (col == 0) {
1558
+ __m256i b4 = _mm256_loadu_si256((__m256i*)(B + k * ldb));
1559
+ if (k + PREFETCH_SIZE_K < K) {
1560
+ _mm_prefetch(B + (k + PREFETCH_SIZE_K) * ldb, _MM_HINT_T0);
1561
+ }
1562
+
1563
+ __m512i b32 = _mm512_cvtepu8_epi32(_mm256_castsi256_si128(b4));
1564
+ vb[0] = _mm512_permutexvar_ps(b32, lut);
1565
+ vb[0] = _mm512_fmadd_ps(vb[0], scale[0], zero[0]);
1566
+ vb[2] = _mm512_permutexvar_ps(_mm512_srli_epi32(b32, 4), lut);
1567
+ vb[2] = _mm512_fmadd_ps(vb[2], scale[2], zero[2]);
1568
+
1569
+ b32 = _mm512_cvtepu8_epi32(_mm256_extracti128_si256(b4, 1));
1570
+ vb[1] = _mm512_permutexvar_ps(b32, lut);
1571
+ vb[1] = _mm512_fmadd_ps(vb[1], scale[1], zero[1]);
1572
+ vb[3] = _mm512_permutexvar_ps(_mm512_srli_epi32(b32, 4), lut);
1573
+ vb[3] = _mm512_fmadd_ps(vb[3], scale[3], zero[3]);
1574
+ }
1575
+ } else {
1576
+ __m128i b8 = {{kernel_name}}_convert_int4_to_int8(B + k * ldb + col * 8);
1577
+ __m512i b32 = _mm512_cvtepu8_epi32(b8);
1578
+ vb[col] = _mm512_permutexvar_ps(b32, lut);
1579
+ vb[col] = _mm512_fmadd_ps(vb[col], scale[col], zero[col]);
1580
+ }
1581
+ }
1582
+
1583
+ constexpr int idx = row * COLS + col;
1584
+ vc[idx] = _mm512_fmadd_ps(va, vb[col], vc[idx]);
1585
+ };
1586
+
1587
+ for (int k = 0, kb = 0; k < K; ++k) {
1588
+ if ({{kernel_name}}_is_block_start(k, k_start, q_group_size)) {
1589
+ c10::ForcedUnroll<COLS>{}(load_scale_and_zeros, kb++);
1590
+ }
1591
+ c10::ForcedUnroll<ROWS * COLS>{}(compute, k);
1592
+ }
1593
+
1594
+ //store to C
1595
+ auto storec = [&, COLS](auto i) {
1596
+ constexpr int row = i / COLS;
1597
+ constexpr int col = i % COLS;
1598
+ _mm512_storeu_ps(C + row * ldc + col * 16, vc[i]);
1599
+ };
1600
+ c10::ForcedUnroll<ROWS * COLS>{}(storec);
1601
+ }
1602
+ """
1603
+
1604
+ def get_kernel_extra_args_declare(self) -> str:
1605
+ return (
1606
+ "const int64_t q_group_size,\n"
1607
+ " const at::BFloat16* __restrict__ ScaleAndZeros,\n"
1608
+ " const int64_t lds,\n"
1609
+ " int64_t k_start,"
1610
+ )
1611
+
1612
+ def get_kernel_extra_args(self, **kwargs) -> list[str]:
1613
+ assert "kernel" in kwargs
1614
+ assert "qscale_and_zeros" in kwargs
1615
+ kernel = kwargs["kernel"]
1616
+ qscale_and_zeros = kwargs["qscale_and_zeros"]
1617
+ return [
1618
+ "group_size,",
1619
+ f"&({kernel.index(qscale_and_zeros, [0, 0, 0])}),",
1620
+ "N * 2,", # lds
1621
+ "k_start,",
1622
+ ]
1623
+
1624
+ def is_woq_int4(self):
1625
+ return True
1626
+
1627
+
1628
+ @register_micro_gemm(
1629
+ *generate_gemm_config(
1630
+ VecAMX,
1631
+ [ # (block_m, block_n, block_k)
1632
+ (16, 32, 32),
1633
+ (32, 32, 32),
1634
+ ],
1635
+ input_dtype=torch.bfloat16,
1636
+ input2_dtype=torch.uint8,
1637
+ output_dtype=torch.float,
1638
+ compute_dtype=torch.float,
1639
+ extra_check=check_amx_extra,
1640
+ ),
1641
+ )
1642
+ class CppMicroGemmWoQInt4Amx(CppMicroGemmAMX):
1643
+ """
1644
+ This class generates the code for WoQ int4 micro gemm using AMX intrinsics,
1645
+ which are available on 4th and newer generations of Intel Xeon.
1646
+ Shape of packed weight = [N // 32, K, 16], viewed as [N, K // 2]
1647
+ Shape of packed ScalesAndZeros = [K // group_size, N, 2]
1648
+ Reuse TEMPLATE_KERNEL of CppMicroGemmAMX.
1649
+ """
1650
+
1651
+ TEMPLATE_ENTRY = r"""
1652
+ inline bool {{kernel_name}}_is_block_start(int index, int k_start, int group_size) {
1653
+ return (k_start + index) % group_size == 0;
1654
+ }
1655
+
1656
+ {{declare_kernel}} {
1657
+ {{kernel.assert_function}}(N % {{block_n}} == 0, "N dimension must be multiple of {{block_n}}");
1658
+ {{kernel.assert_function}}(K % 2 == 0, "K dimension must be multiple of 2");
1659
+ {{kernel.assert_function}}({{block_n}} == 32, "block_n must be 32 for WOQ int4");
1660
+
1661
+ // Create a stack-allocated buffer for tiles of B.
1662
+ // Except maybe for the tail-case, an AMX tile of B has 16x32 BF16 elements.
1663
+ // we cache K * {{block_n}} elements of dequantized B
1664
+ {{template.codegen_allocate_weight_buffer("dequantized_B_buf", input_t, "K", block_n)}}
1665
+
1666
+ constexpr int BLOCK_K = {{block_k}};
1667
+ constexpr int64_t BLOCK_N = {{block_n}};
1668
+ constexpr int COLS = BLOCK_N / 16;
1669
+ const int PREFETCH_SIZE_K = 16 * 4;
1670
+ const int PREFETCH_SIZE_KB = (PREFETCH_SIZE_K + BLOCK_K - 1) / BLOCK_K;
1671
+ const int KB = K / BLOCK_K;
1672
+
1673
+ __m512i b32[COLS * 2];
1674
+ __m512 vb[COLS * 2];
1675
+ __m512 scale[COLS];
1676
+ __m512 zero[COLS];
1677
+
1678
+ // Lookup table to de-quantize int4 values to bf16.
1679
+ // Values are dequantized as truly int4 [-8, 7] range;
1680
+ //
1681
+ // dequant = (bf16(int4_value) * bf16_scale) + bf16_zero
1682
+ //
1683
+ static const __m512 lut = _mm512_set_ps(
1684
+ 7.0f, 6.0f, 5.0f, 4.0f,
1685
+ 3.0f, 2.0f, 1.0f, 0.0f,
1686
+ -1.0f, -2.0f, -3.0f, -4.0f,
1687
+ -5.0f, -6.0f, -7.0f, -8.0f);
1688
+
1689
+ // index for transpose
1690
+ static const __m512i idx1 = _mm512_set_epi32(
1691
+ 30, 28, 26, 24, 22, 20, 18, 16,
1692
+ 14, 12, 10, 8, 6, 4, 2, 0);
1693
+ static const __m512i idx2 = _mm512_set_epi32(
1694
+ 31, 29, 27, 25, 23, 21, 19, 17,
1695
+ 15, 13, 11, 9, 7, 5, 3, 1);
1696
+
1697
+ // Indices for VNNI layout conversion
1698
+ __m512i idx_low = _mm512_set_epi32(
1699
+ 0x17,
1700
+ 0x07,
1701
+ 0x16,
1702
+ 0x06,
1703
+ 0x15,
1704
+ 0x05,
1705
+ 0x14,
1706
+ 0x04,
1707
+ 0x13,
1708
+ 0x03,
1709
+ 0x12,
1710
+ 0x02,
1711
+ 0x11,
1712
+ 0x01,
1713
+ 0x10,
1714
+ 0x00);
1715
+ __m512i idx_high = _mm512_set_epi32(
1716
+ 0x1f,
1717
+ 0x0f,
1718
+ 0x1e,
1719
+ 0x0e,
1720
+ 0x1d,
1721
+ 0x0d,
1722
+ 0x1c,
1723
+ 0x0c,
1724
+ 0x1b,
1725
+ 0x0b,
1726
+ 0x1a,
1727
+ 0x0a,
1728
+ 0x19,
1729
+ 0x09,
1730
+ 0x18,
1731
+ 0x08);
1732
+
1733
+ // load scale and zero point
1734
+ auto load_scale_and_zeros = [&](int i, int _kb) {
1735
+ // load 2x bfloat16 vector
1736
+ __m512i t = _mm512_loadu_si512((__m512i*)(ScaleAndZeros + _kb * lds + 32 * i));
1737
+ if (_kb + PREFETCH_SIZE_KB < KB) {
1738
+ _mm_prefetch(ScaleAndZeros + (_kb + PREFETCH_SIZE_KB) * lds + 32 * i, _MM_HINT_T0);
1739
+ }
1740
+
1741
+ // convert to 2x f32 vector
1742
+ __m512 a, b;
1743
+ at::vec::cvtbf16_fp32(t, a, b);
1744
+
1745
+ // transpose scale_and_zero from {16, 2} to {2, 16}
1746
+ // inputs:
1747
+ // a: {s0, z0, s1, z1, ..., s7, z7}
1748
+ // b: {s8, z8, s9, z9, ..., s15, z15}
1749
+ // output:
1750
+ // scale: {s0, s1, s2, ..., s15}
1751
+ // zero: {z0, z1, z2, ..., z15}
1752
+ scale[i] = _mm512_mask_permutex2var_ps(a, 0xffff, idx1, b);
1753
+ zero[i] = _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b);
1754
+ };
1755
+
1756
+ // Dequantize a B block of 2 * block_n into bf16
1757
+ // So, it handles k and k+1 at the same time
1758
+ auto dequantize_B = [&](int n) {
1759
+ constexpr int64_t ldb_int4 = BLOCK_N / 2; // 16
1760
+ for (int k = 0, kb = 0; k < K; k += 2) {
1761
+ // Since block_k must be 32 for AMX microkernels, k_start may not be
1762
+ // a multiple of q_group_size. In that case, we need to load scales
1763
+ // and zero points immediately when k == 0 here
1764
+ if ({{kernel_name}}_is_block_start(k, k_start, q_group_size) || k == 0) {
1765
+ c10::ForcedUnroll<COLS>{}(load_scale_and_zeros, kb++);
1766
+ }
1767
+
1768
+ // load 256 bits = 64 elements in int4
1769
+ if (k + PREFETCH_SIZE_K < K) {
1770
+ _mm_prefetch(B + (k + PREFETCH_SIZE_K) * ldb_int4, _MM_HINT_T0);
1771
+ }
1772
+
1773
+ __m128i b4 = _mm_loadu_si128((__m128i*)(B + n / 2 * K + k * ldb_int4));
1774
+ b32[0] = _mm512_cvtepu8_epi32(b4);
1775
+ b32[1] = _mm512_srli_epi32(b32[0], 4);
1776
+ vb[0] = _mm512_permutexvar_ps(b32[0] , lut);
1777
+ vb[0] = _mm512_fmadd_ps(vb[0], scale[0], zero[0]);
1778
+ vb[1] = _mm512_permutexvar_ps(b32[1], lut);
1779
+ vb[1] = _mm512_fmadd_ps(vb[1], scale[1], zero[1]);
1780
+
1781
+ b4 = _mm_loadu_si128((__m128i*)(B + n / 2 * K + (k + 1) * ldb_int4));
1782
+ b32[0 + COLS] = _mm512_cvtepu8_epi32(b4);
1783
+ b32[1 + COLS] = _mm512_srli_epi32(b32[0 + COLS], 4);
1784
+ vb[0 + COLS] = _mm512_permutexvar_ps(b32[0 + COLS] , lut);
1785
+ vb[0 + COLS] = _mm512_fmadd_ps(vb[0 + COLS], scale[0], zero[0]);
1786
+ vb[1 + COLS] = _mm512_permutexvar_ps(b32[1 + COLS], lut);
1787
+ vb[1 + COLS] = _mm512_fmadd_ps(vb[1 + COLS], scale[1], zero[1]);
1788
+
1789
+ for (int i = 0; i < COLS; i++) {
1790
+ // convert to VNNI
1791
+ auto low = _mm512_permutex2var_ps(vb[i], idx_low, vb[i + COLS]);
1792
+ auto high = _mm512_permutex2var_ps(vb[i], idx_high, vb[i + COLS]);
1793
+ // convert lower 16 float32 values to bfloat16
1794
+ auto v0_bf16 = reinterpret_cast<__m256i>(_mm512_cvtneps_pbh(low));
1795
+ // convert higher 16 float32 values to bfloat16
1796
+ auto v1_bf16 = reinterpret_cast<__m256i>(_mm512_cvtneps_pbh(high));
1797
+ // combine the lower 16 and higher 16 bfloat16 values
1798
+ auto v = _mm512_castsi256_si512(v0_bf16);
1799
+ v = _mm512_inserti64x4(v, v1_bf16, 1);
1800
+ // store the VNNI format bfloat16 values
1801
+ {{input_t}}* addr = dequantized_B_buf + k * 32 + (i % 2) * 32;
1802
+ _mm512_storeu_si512(addr, v);
1803
+ }
1804
+ }
1805
+ };
1806
+
1807
+ for (int64_t n = 0; n < N; n += {{block_n}}) {
1808
+ // Dequantize K * block_n int8 B elements into BF16
1809
+ dequantize_B(n);
1810
+ for (int64_t m = 0; m < M; m += {{block_m}}) {
1811
+ int64_t block_m = std::min<int64_t>(M - m, {{block_m}});
1812
+ int64_t m_tail = m;
1813
+ {%- for num_rows in range(block_m, 0, -16) %}
1814
+ {%- if num_rows != block_m %}
1815
+ else
1816
+ {%- endif %}
1817
+ if (block_m >= {{num_rows}}) {
1818
+ {{kernel_name}}_amx_kernel_{{num_rows}}_{{num_columns}}<accum>(
1819
+ amx_state,
1820
+ A + m * lda,
1821
+ dequantized_B_buf + n * K,
1822
+ C + m * ldc + n,
1823
+ K,
1824
+ lda,
1825
+ {{block_n}},
1826
+ ldc,
1827
+ 16
1828
+ );
1829
+ block_m -= {{num_rows}};
1830
+ m_tail += {{num_rows}};
1831
+ }
1832
+ {%- endfor %}
1833
+ if (block_m > 0) {
1834
+ {{kernel_name}}_amx_kernel_16_{{num_columns}}<accum>(
1835
+ amx_state,
1836
+ A + m_tail * lda,
1837
+ dequantized_B_buf + n * K,
1838
+ C + m_tail * ldc + n,
1839
+ K,
1840
+ lda,
1841
+ {{block_n}},
1842
+ ldc,
1843
+ block_m
1844
+ );
1845
+ }
1846
+ } // for m
1847
+ } // for n
1848
+ }
1849
+ """
1850
+
1851
+ def get_kernel_extra_args_declare(self) -> str:
1852
+ return (
1853
+ "AMXState& amx_state,\n"
1854
+ " const int64_t q_group_size,\n"
1855
+ " const c10::BFloat16* __restrict__ ScaleAndZeros,\n"
1856
+ " const int64_t lds,\n"
1857
+ " int64_t k_start,"
1858
+ )
1859
+
1860
+ def get_kernel_extra_args(self, **kwargs) -> list[str]:
1861
+ assert "kernel" in kwargs
1862
+ assert "qscale_and_zeros" in kwargs
1863
+ kernel = kwargs["kernel"]
1864
+ qscale_and_zeros = kwargs["qscale_and_zeros"]
1865
+ return [
1866
+ "amx_state,",
1867
+ "group_size,",
1868
+ f"&({kernel.index(qscale_and_zeros, [0, 0, 0])}),",
1869
+ "N * 2,", # lds
1870
+ "k_start,",
1871
+ ]
1872
+
1873
+ def is_woq_int4(self):
1874
+ return True
1875
+
1876
+
1877
+ def create_micro_gemm(
1878
+ name,
1879
+ m,
1880
+ n,
1881
+ k,
1882
+ input_dtype,
1883
+ input2_dtype,
1884
+ output_dtype=None,
1885
+ compute_dtype=None,
1886
+ alpha=1,
1887
+ num_threads=-1,
1888
+ use_ref=True,
1889
+ q_group_size=None,
1890
+ ) -> Optional[CppMicroGemm]:
1891
+ """
1892
+ Based on the provided info, try to find the config of the micro-kernel that would
1893
+ deliver the best performance in terms of lower latency for this case.
1894
+ """
1895
+
1896
+ def create_from_config(cls, config: CppMicroGemmConfig):
1897
+ return cls(
1898
+ name,
1899
+ config.input_dtype,
1900
+ config.input2_dtype,
1901
+ config.output_dtype,
1902
+ config.compute_dtype,
1903
+ config.register_blocking,
1904
+ alpha,
1905
+ )
1906
+
1907
+ def skip_amx_kernel_for_woq(config, dynamic_M, micro_gemm_cls):
1908
+ # For WoQ GEMM, AMX micro-kernel may not perform well if m is small.
1909
+ # Exception: for dynamic shapes, we consider using the AMX micro-kernel.
1910
+ if (
1911
+ dynamic_M
1912
+ or input_dtype != torch.bfloat16
1913
+ or input2_dtype not in [torch.int8, torch.uint8]
1914
+ ):
1915
+ return False
1916
+ # For WOQ INT8, use AMX for m >= block_m
1917
+ # For WOQ INT4, use AMX for m >= 5
1918
+ block_m, *_ = config.register_blocking
1919
+ is_woq_int4 = micro_gemm_cls == CppMicroGemmWoQInt4Amx
1920
+ m_threshold = 5 if is_woq_int4 else block_m
1921
+ return m < m_threshold
1922
+
1923
+ assert isinstance(n, int) or n.is_number, n
1924
+ assert isinstance(k, int) or k.is_number, k
1925
+ from ..utils import has_free_symbols
1926
+
1927
+ dynamic_M = has_free_symbols((m,))
1928
+ m = V.graph.sizevars.size_hint(m, fallback=1) if dynamic_M else m
1929
+ assert isinstance(m, int) or m.is_number, m
1930
+ if output_dtype is None:
1931
+ output_dtype = input_dtype
1932
+ if compute_dtype is None:
1933
+ compute_dtype = output_dtype
1934
+ if num_threads < 0:
1935
+ num_threads = parallel_num_threads()
1936
+ vec_isa = pick_vec_isa()
1937
+ matched_configs = []
1938
+ for cls, configs in micro_gemm_configs.items():
1939
+ for config in configs:
1940
+ if not issubclass(vec_isa.__class__, config.vec_isa_cls):
1941
+ continue
1942
+ if (
1943
+ config.input_dtype == input_dtype
1944
+ and config.compute_dtype == compute_dtype
1945
+ and config.input2_dtype == input2_dtype
1946
+ and config.output_dtype == output_dtype
1947
+ # The output_dtype here is the output dtype of the micro-kernel.
1948
+ # In some cases, the actual output dtype of the op for which the micro-kernel
1949
+ # is being created would be same as that of the activation, but the micro-kernels
1950
+ # compute output in Float/int32, which is converted in the GEMM template. This is
1951
+ # subject to change in the future.
1952
+ ):
1953
+ if config.extra_check is not None and not config.extra_check(
1954
+ config,
1955
+ m,
1956
+ n,
1957
+ k,
1958
+ alpha,
1959
+ num_threads,
1960
+ dynamic_M=dynamic_M,
1961
+ q_group_size=q_group_size,
1962
+ ):
1963
+ continue
1964
+ block_m, block_n, block_k = config.register_blocking
1965
+ if config.vec_isa_cls == VecAMX and skip_amx_kernel_for_woq(
1966
+ config, dynamic_M, cls
1967
+ ):
1968
+ continue
1969
+ # Criteria on the ranking of configurations
1970
+ # 1. ISA: AMX > VEC
1971
+ # 2. Dividable by block sizes (block_m, block_n, block_k)
1972
+ # 3. Number of mxn blocks is large enough to occupy all the threads
1973
+ # 4. Register blocks are larger
1974
+ isa_score = 0
1975
+ if config.vec_isa_cls == VecAMX:
1976
+ isa_score += 1
1977
+ dividable_score = 0
1978
+ if m % block_m == 0:
1979
+ dividable_score += 1
1980
+ if n % block_n == 0:
1981
+ dividable_score += 1
1982
+ if k % block_k == 0:
1983
+ dividable_score += 1
1984
+ occupancy_score = 0
1985
+ n_blocks = (n + block_n - 1) // block_n
1986
+ total_mxn_blocks = n_blocks * ((m + block_m - 1) // block_m)
1987
+ if n_blocks >= num_threads:
1988
+ occupancy_score += 1
1989
+ if total_mxn_blocks >= num_threads:
1990
+ occupancy_score += 1
1991
+ register_bytes = (
1992
+ block_m * block_n * config.compute_dtype.itemsize
1993
+ + (block_m * block_k + block_k * block_n)
1994
+ * config.input_dtype.itemsize
1995
+ )
1996
+ matched_configs.append(
1997
+ (
1998
+ (isa_score, dividable_score, occupancy_score, register_bytes),
1999
+ cls,
2000
+ config,
2001
+ )
2002
+ )
2003
+ if len(matched_configs) == 0:
2004
+ if use_ref:
2005
+ return CppMicroGemmRef(
2006
+ name, input_dtype, input2_dtype, output_dtype, compute_dtype, alpha
2007
+ )
2008
+ else:
2009
+ return None
2010
+ # TODO(jgong5): allow autotuning on choices of configs
2011
+ return create_from_config(*max(matched_configs, key=operator.itemgetter(0))[1:])
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_template.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import ctypes
3
+ import functools
4
+ import itertools
5
+ import logging
6
+ import sys
7
+ from collections.abc import Iterable
8
+ from typing import Callable, Optional, Union
9
+ from unittest.mock import patch
10
+
11
+ import sympy
12
+
13
+ from .. import config, ir
14
+ from ..autotune_process import CppBenchmarkRequest, TensorMeta
15
+ from ..utils import IndentedBuffer, Placeholder, unique
16
+ from ..virtualized import V
17
+ from .common import KernelTemplate
18
+ from .cpp_template_kernel import CppTemplateCaller, CppTemplateKernel
19
+
20
+
21
+ log = logging.getLogger(__name__)
22
+
23
+
24
+ class CppTemplate(KernelTemplate):
25
+ index_counter = itertools.count()
26
+
27
+ def __init__(
28
+ self,
29
+ name: str,
30
+ input_nodes,
31
+ layout: ir.Layout,
32
+ num_threads: int,
33
+ epilogue_creator: Optional[Callable[[ir.Buffer], ir.Pointwise]] = None,
34
+ ) -> None:
35
+ super().__init__(name)
36
+ self.input_nodes = input_nodes
37
+ self.index = next(self.index_counter)
38
+ self.output_node: Union[ir.Buffer, list[ir.Buffer]] = ir.Buffer(
39
+ name=f"buf_out{self.index}", layout=layout
40
+ )
41
+ self.layout = layout
42
+ self.num_threads = num_threads
43
+ self.epilogue_creator = epilogue_creator
44
+
45
+ def generate(self, **kwargs):
46
+ kernel_name = f"cpp_{self.name}"
47
+ with (
48
+ patch.object(V.graph, "get_dtype", self._fake_get_dtype(self.output_node)),
49
+ patch.object(ir.FlexibleLayout, "allow_indexing", True),
50
+ V.graph.set_current_device(self.layout.device),
51
+ CppTemplateKernel(
52
+ kernel_name=kernel_name, num_threads=self.num_threads
53
+ ) as kernel,
54
+ ):
55
+ code = kernel.render(self, **kwargs)
56
+ _, call_args, _, _ = kernel.args.python_argdefs()
57
+ log.debug("Generated Code:\n%s", code)
58
+ log.debug(
59
+ "Args: cpp_argdefs: %s, python_argdefs: %s",
60
+ kernel.args.cpp_argdefs(),
61
+ kernel.args.python_argdefs(),
62
+ )
63
+
64
+ expected_args = list(
65
+ unique(input_node.get_name() for input_node in self.input_nodes)
66
+ )
67
+ if isinstance(self.output_node, Iterable):
68
+ expected_args.extend([node.get_name() for node in self.output_node])
69
+ else:
70
+ expected_args.extend([self.output_node.get_name()])
71
+ assert list(call_args)[: len(expected_args)] == expected_args, (
72
+ call_args,
73
+ expected_args,
74
+ )
75
+ extra_args = V.graph.sizevars.size_hints(
76
+ map(sympy.expand, call_args[len(expected_args) :])
77
+ )
78
+ # Cast the size hint from int to ctypes.c_ulonglong explicitly
79
+ # since in cpp kernel, we bind it to C long
80
+ extra_args = tuple(ctypes.c_ulonglong(x) for x in extra_args)
81
+
82
+ kernel_hash_name = f"cpp_{self.name}_{self.index}"
83
+
84
+ # Create the BenchmarkRequest for CPP
85
+ bmreq = CppBenchmarkRequest(
86
+ kernel_name=kernel_name,
87
+ input_tensor_meta=TensorMeta.from_irnodes(self.input_nodes),
88
+ output_tensor_meta=TensorMeta.from_irnodes(self.output_node),
89
+ extra_args=extra_args,
90
+ source_code=code,
91
+ )
92
+
93
+ def make_kernel_render(
94
+ template_node: ir.CppTemplateBuffer,
95
+ flag_template_buffer_has_other_users: bool,
96
+ epilogue_nodes: Optional[list[ir.IRNode]] = None,
97
+ ):
98
+ kernel = CppTemplateKernel(
99
+ kernel_name=str(Placeholder.KERNEL_NAME), num_threads=self.num_threads
100
+ )
101
+ render = functools.partial(
102
+ kernel.render,
103
+ self,
104
+ template_buffer_node=template_node,
105
+ flag_template_buffer_has_other_users=flag_template_buffer_has_other_users,
106
+ epilogue_nodes=epilogue_nodes,
107
+ **kwargs,
108
+ )
109
+ return kernel, render
110
+
111
+ return CppTemplateCaller(
112
+ kernel_hash_name,
113
+ self.name,
114
+ self.input_nodes,
115
+ self.output_node[0].get_layout()
116
+ if isinstance(self.output_node, Iterable)
117
+ else self.output_node.get_layout(),
118
+ make_kernel_render,
119
+ bmreq,
120
+ self,
121
+ )
122
+
123
+ def header(self) -> IndentedBuffer:
124
+ res = IndentedBuffer()
125
+ res.writeline("#include <torch/csrc/inductor/cpp_prefix.h>")
126
+ # TODO: add c10::ForcedUnroll test to test_aoti_abi_check
127
+ res.splice("""#include <c10/util/Unroll.h>""")
128
+ res.splice("""#include <torch/csrc/inductor/aoti_torch/c/shim.h>""")
129
+ enable_kernel_profile = config.cpp.enable_kernel_profile and sys.platform in [
130
+ "linux",
131
+ "win32",
132
+ ]
133
+ if enable_kernel_profile:
134
+ res.writelines(["#include <ATen/record_function.h>"])
135
+ return res
136
+
137
+ def render(self, **kwargs) -> str:
138
+ raise NotImplementedError
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_template_kernel.py ADDED
@@ -0,0 +1,597 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import itertools
3
+ from collections.abc import Iterable
4
+ from typing import Any, Callable, Optional, Union
5
+
6
+ import sympy
7
+ from sympy.parsing.sympy_parser import parse_expr
8
+
9
+ import torch
10
+ from torch._inductor.utils import do_bench_using_profiling
11
+ from torch.utils._ordered_set import OrderedSet
12
+ from torch.utils._sympy.symbol import SymT
13
+
14
+ from .. import config, cpp_builder, ir, lowering as L
15
+ from ..autotune_process import CppBenchmarkRequest
16
+ from ..loop_body import LoopBody
17
+ from ..select_algorithm import PartialRender
18
+ from ..utils import sympy_index_symbol, sympy_index_symbol_with_prefix
19
+ from ..virtualized import V
20
+ from .common import REMOVED
21
+ from .cpp import CppKernel, CppKernelProxy, KernelGroup
22
+ from .cpp_utils import cexpr_index, DTYPE_TO_CPP, LocalBufferContext
23
+
24
+
25
+ def parse_expr_with_index_symbols(expr):
26
+ if isinstance(expr, sympy.Expr):
27
+ return expr
28
+ elif isinstance(expr, (list, tuple)):
29
+ return [parse_expr_with_index_symbols(e) for e in expr]
30
+ else:
31
+ expr = parse_expr(str(expr))
32
+ int_symbols = {sym: sympy_index_symbol(sym.name) for sym in expr.free_symbols}
33
+ return expr.subs(int_symbols)
34
+
35
+
36
+ def wrap_with_tensorbox(node) -> ir.TensorBox:
37
+ return (
38
+ ir.TensorBox.create(node) if isinstance(node, ir.Buffer) else ir.TensorBox(node)
39
+ )
40
+
41
+
42
+ class CppTemplateKernel(CppKernel):
43
+ def __init__(self, kernel_name, num_threads):
44
+ super().__init__(None, num_threads)
45
+ self.kernel_name = kernel_name
46
+ self.render_hooks = {}
47
+ self.local_buffers = {}
48
+
49
+ def render(self, template, **kwargs):
50
+ return PartialRender(
51
+ template.render(kernel=self, **kwargs), self.render_hooks
52
+ ).finalize_all()
53
+
54
def def_kernel(
    self,
    inputs: dict[str, ir.Buffer],
    outputs: dict[str, ir.Buffer],
    aliases: Optional[dict[str, str]] = None,
    function_name: str = "",
    extra_sizevars: Optional[list[sympy.Expr]] = None,
    placeholder: str = "<DEF_KERNEL>",
) -> str:
    """
    Register the kernel's inputs/outputs/size variables with ``self.args``
    and return ``placeholder``, which is later substituted by a render hook
    with the generated C++ function signature.

    Args:
        inputs: template-local name -> input buffer (None entries are skipped).
        outputs: template-local name -> output buffer.
        aliases: alias name -> original template-local name; aliases are
            temporarily registered and removed again inside the hook.
        function_name: C++ function name; defaults to ``self.kernel_name``.
        extra_sizevars: extra symbolic expressions whose free symbols must
            become kernel size arguments.
        placeholder: marker string emitted into the rendered template.

    Returns:
        The ``placeholder`` string (finalized later via ``self.render_hooks``).
    """
    if len(function_name) == 0:
        function_name = str(self.kernel_name)
    for name, inp in inputs.items():
        if inp is not None:
            self.args.input_buffers[inp.get_name()] = name
    for name, out in outputs.items():
        self.args.output_buffers[out.get_name()] = name
    if aliases is not None:
        # Aliases resolve to the same argument name as the buffer they alias.
        for alias, orig in aliases.items():
            if orig in self.args.input_buffers:
                self.args.input_buffers[alias] = self.args.input_buffers[orig]
            if orig in self.args.output_buffers:
                self.args.output_buffers[alias] = self.args.output_buffers[orig]

    # Collect every free symbol appearing in input/output sizes and strides
    # (plus any caller-supplied extras): each becomes a size-var argument.
    unique_sizevars = OrderedSet(
        s
        for input in inputs.values()
        if input is not None
        for sym in itertools.chain(input.get_size(), input.get_stride())
        if isinstance(sym, sympy.Expr)
        for s in sym.free_symbols
    )
    unique_sizevars.update(
        s
        for sym in extra_sizevars or []
        if isinstance(sym, sympy.Expr)
        for s in sym.free_symbols
    )
    unique_sizevars.update(
        s
        for output in outputs.values()
        for sym in itertools.chain(output.get_size(), output.get_stride())
        if isinstance(sym, sympy.Expr)
        for s in sym.free_symbols
    )
    # Sort by name for a deterministic argument order.
    sizevars = sorted(unique_sizevars, key=str)
    for sizevar in sizevars:
        self.args.sizevars[sizevar] = f"k{sizevar}"

    def hook():
        # remove all aliases before generate function definition
        if aliases is not None:
            for alias in aliases:
                if alias in self.args.input_buffers:
                    raise AssertionError(
                        f"input_buffers cannot be removed: {alias}"
                    )
                if alias in self.args.output_buffers:
                    self.args.output_buffers[alias] = REMOVED
        cpp_argdefs, _, _ = self.args.cpp_argdefs()
        return f"void {function_name}({', '.join(cpp_argdefs)})"

    assert placeholder not in self.render_hooks
    self.render_hooks[placeholder] = hook
    return placeholder
118
+
119
def call_kernel(self, name: str, node: ir.CppTemplateBuffer):
    """Emit the wrapper-side call to the generated (non-Triton) kernel."""
    _, runtime_args, runtime_arg_types = self.args.cpp_argdefs()
    V.graph.wrapper_code.generate_kernel_call(
        name, runtime_args, triton=False, arg_types=runtime_arg_types
    )
123
+
124
def dtype(self, node: ir.Buffer) -> str:
    """Return the C++ scalar type name corresponding to *node*'s dtype."""
    torch_dtype = node.get_dtype()
    return DTYPE_TO_CPP[torch_dtype]
126
+
127
def acc_dtype(self, node: ir.Buffer) -> str:
    """Return the C++ accumulator type for *node*: float accumulates fp32/bf16/fp16."""
    node_dtype = node.get_dtype()
    if node_dtype not in (torch.float32, torch.bfloat16, torch.half):
        raise NotImplementedError(f"Unsupported dtype: {node_dtype}")
    return "float"
132
+
133
def size(self, node: ir.Buffer, dim: int) -> str:
    """Render the size of *node* along *dim* as a C++ index expression."""
    dim_size = node.get_size()[dim]
    return cexpr_index(self.rename_indexing(dim_size))
135
+
136
def stride(self, node: ir.Buffer, dim: int) -> str:
    """Render the stride of *node* along *dim* as a C++ index expression."""
    dim_stride = node.get_stride()[dim]
    return cexpr_index(self.rename_indexing(dim_stride))
138
+
139
def index(self, node: ir.Buffer, indices: list[Any]) -> str:
    """Render a C++ element access ``name[flat_index]`` for *node* at *indices*.

    Local buffers keep their own name; everything else is routed through
    ``self.args.input`` so the buffer becomes a kernel argument.
    """
    flat_index = node.get_layout().as_fixed().make_indexer()(
        parse_expr_with_index_symbols(indices)
    )
    flat_index = self.rename_indexing(flat_index)
    buf_name = node.get_name()
    if buf_name in self.local_buffers:
        access_name = buf_name
    else:
        access_name = self.args.input(node.get_name())
    return f"{access_name}[{cexpr_index(flat_index)}]"
150
+
151
def slice_nd(self, node, ranges: list[tuple[Any, Any]]) -> ir.ReinterpretView:
    """
    Slice the given node with a list of ranges (start and end) corresponding to its dims.
    The dim is not sliced if the corresponding range is empty.

    Uses ``clamp=False`` so the symbolic start/end are taken as-is rather
    than clamped to the dim size.
    """
    assert len(ranges) == len(node.get_size()), f"{ranges=}, {node=}"
    sliced = wrap_with_tensorbox(node)
    for dim, _range in enumerate(ranges):
        if len(_range) == 0:
            # Empty range: keep this dim unsliced.
            continue
        assert len(_range) == 2
        start, end = parse_expr_with_index_symbols(_range)
        sliced = L.slice_(sliced, dim, start, end, clamp=False)
    assert isinstance(sliced.data, ir.ReinterpretView), sliced.data
    return sliced.data
166
+
167
def select(self, node, dim: int, idx: int) -> ir.ReinterpretView:
    """Select index *idx* along *dim*, dropping that dim (like tensor.select)."""
    # We avoid using L.select here because we need clamp=False so the dim after slicing
    # is 1 instead of a sympy expression of symbol - dim_size.
    node = wrap_with_tensorbox(node)
    idx = ir.View.handle_negative_index(idx, node.get_size()[dim])
    # Slice [idx, idx+1) then squeeze the now size-1 dim away.
    sliced = L.squeeze(L.slice_(node, dim, idx, idx + 1, clamp=False), dim)
    assert isinstance(sliced.data, ir.ReinterpretView), sliced.data
    return sliced.data
175
+
176
def view(self, node, sizes: list[Any]) -> ir.View:
    """Return a view of *node* reshaped to *sizes* (sizes may be symbolic)."""
    boxed = wrap_with_tensorbox(node)
    parsed_sizes = parse_expr_with_index_symbols(sizes)
    return L.view(boxed, parsed_sizes).data
180
+
181
def permute(self, node, dims):
    """Permute *node*'s dims; the result must lower to a ReinterpretView."""
    boxed = wrap_with_tensorbox(node)
    permuted_view = L.permute(boxed, dims).data
    assert isinstance(permuted_view, ir.ReinterpretView)
    return permuted_view
186
+
187
def maybe_codegen_profile(self) -> str:
    """Emit a RECORD_FUNCTION profiling line when kernel profiling is enabled."""
    if not config.cpp.enable_kernel_profile:
        return ""
    graph_id = V.graph.graph_id
    # Prefix the event name with the graph id when one is available.
    prefix = f"graph_{graph_id}_" if graph_id is not None else ""
    return f'RECORD_FUNCTION("{prefix}{self.kernel_name}", c10::ArrayRef<c10::IValue>({{}}));'
194
+
195
def unroll_pragma(self, unroll):
    """Return the compiler-appropriate loop-unroll pragma for factor *unroll*."""
    pragma_prefix = "#pragma GCC unroll" if cpp_builder.is_gcc() else "#pragma unroll"
    return f"{pragma_prefix} {unroll}"
200
+
201
def define_buffer(self, name, sizes: list[Any], dtype=torch.float) -> str:
    """Define kernel local buffer

    Registers a heap-allocated local buffer and returns the C++ declaration:
    a unique_ptr ``_name`` plus a raw pointer alias ``name``.
    """
    parsed_sizes = parse_expr_with_index_symbols(sizes)
    local_buf = ir.Buffer(
        name=name, layout=ir.FixedLayout(torch.device("cpu"), dtype, parsed_sizes)
    )
    self.local_buffers[name] = local_buf
    ctype = f"{DTYPE_TO_CPP[dtype]}"
    numel = f"{cexpr_index(local_buf.get_numel())}"
    return f"auto _{name} = std::make_unique<{ctype}[]>({numel}); auto {name} = _{name}.get();"
211
+
212
def define_stack_allocated_buffer(
    self, name, sizes: list[Any], dtype=torch.float
) -> str:
    """Define stack-allocated buffer

    Like ``define_buffer`` but emits a 64-byte-aligned stack array
    ``_name`` with a raw pointer alias ``name``.
    """
    parsed_sizes = parse_expr_with_index_symbols(sizes)
    local_buf = ir.Buffer(
        name=name, layout=ir.FixedLayout(torch.device("cpu"), dtype, parsed_sizes)
    )
    self.local_buffers[name] = local_buf
    ctype = f"{DTYPE_TO_CPP[dtype]}"
    numel = f"{cexpr_index(local_buf.get_numel())}"
    return f"alignas(64) {ctype} _{name}[{numel}]; {ctype}* {name} = _{name};"
224
+
225
def reinit_buffer_if_null(self, name):
    """Reinit the previously defined local buffer if it is null"""
    assert name in self.local_buffers
    local_buf = self.local_buffers[name]
    ctype = f"{DTYPE_TO_CPP[local_buf.layout.dtype]}"
    numel = f"{cexpr_index(local_buf.get_numel())}"
    return f"if (_{name} == nullptr) {{ _{name} = std::make_unique<{ctype}[]>({numel}); {name} = _{name}.get(); }}"
232
+
233
def release_buffer(self, name):
    """Codegen the code to release the ownership of a local buffer to others"""
    assert name in self.local_buffers, name
    return f"_{name}.release()"
237
+
238
def store_pointwise_nodes(
    self,
    dst: ir.Buffer,
    nodes: list[ir.IRNode],
    offsets: Optional[list[sympy.Expr]] = None,
    reindexers: Optional[list[Optional[Callable[[list[Any]], list[Any]]]]] = None,
) -> str:
    """
    Codegen C++ loops that evaluate the pointwise *nodes* and store into *dst*.

    All nodes but the last store into their own buffer name; the final node
    stores into *dst*. ``offsets`` shift the loop indices before feeding the
    node loaders; ``reindexers`` (one per node, optional) remap indices to
    match each node's own indexing. Returns the generated loop source.
    """
    var_sizes = (tuple(dst.get_size()), ())
    var_ranges = {
        sympy_index_symbol_with_prefix(SymT.INDEX, i): sz
        for i, sz in enumerate(var_sizes[0])
    }
    if not offsets:
        offsets = [sympy.S.Zero] * len(var_sizes[0])
    if not reindexers:
        reindexers = [None] * len(nodes)
    assert len(offsets) == len(var_sizes[0])
    # Flat store index into dst in terms of the loop itervars.
    output_index = dst.get_layout().make_indexer()([*var_ranges.keys()])
    kernel_group = KernelGroup()
    kernel_group.args = self.args
    cpp_kernel_proxy = CppKernelProxy(kernel_group)
    bodies = []
    var_sizes_list = []
    for i, node in enumerate(nodes):
        # Only the last node writes to dst; earlier nodes keep their names.
        output_name = node.get_name() if i < len(nodes) - 1 else dst.get_name()
        node = node.data if isinstance(node, ir.ComputedBuffer) else node
        assert isinstance(node, ir.Pointwise), node

        def fn(*args):
            # args = (index_vars, reduction_vars); pointwise => no reductions.
            assert len(args) == 2
            assert len(args[0]) == len(var_sizes[0])
            assert len(args[1]) == 0
            new_args = [arg + offset for arg, offset in zip(args[0], offsets)]  # type: ignore[arg-type]
            if reindexers[i] is not None:
                new_args = reindexers[i](new_args)  # type: ignore[misc]
            V.ops.store(
                output_name,
                output_index,
                node.make_loader()(new_args).value,
            )

        # NOTE: fn is consumed by LoopBody inside this same iteration, so the
        # closure over the loop variables i/node/output_name is safe.
        body = LoopBody(
            fn,
            (list(var_ranges.keys()), ()),
            var_ranges,
            list(var_ranges.keys()),
            tuple(),
        )
        bodies.append(body)
        var_sizes_list.append(var_sizes)

    cpp_kernel_proxy.codegen_loop_bodies(bodies, var_sizes_list)
    kernel_group.finalize_kernel(cpp_kernel_proxy, [])
    return kernel_group.loops_code.getvalue()
292
+
293
def store_grouped_gemm_pointwise_nodes(
    self,
    dst: tuple[ir.Buffer],
    nodes: list[ir.IRNode],
    offsets: list[sympy.Expr],
    reindexers: list[Optional[Callable[[list[Any]], list[Any]]]],
    output_names: list[str],
) -> str:
    """
    Grouped-GEMM variant of ``store_pointwise_nodes``: codegen loops that
    evaluate *nodes* and store each one into its matching ``output_names[i]``.

    Loop ranges and the flat store index are derived from ``dst[0]``; the
    caller guarantees all dst buffers share that shape. Unlike the
    single-output variant, ``offsets`` is a per-node list of offset lists
    and must be provided by the caller. Returns the generated loop source.
    """
    # All outputs share the shape of the first dst buffer.
    ref_dst = dst[0]
    var_sizes = (tuple(ref_dst.get_size()), ())
    var_ranges = {
        sympy_index_symbol_with_prefix(SymT.INDEX, i): sz
        for i, sz in enumerate(var_sizes[0])
    }
    assert offsets, "offsets should be set outside"
    assert all(len(offset) == len(var_sizes[0]) for offset in offsets)
    output_index = ref_dst.get_layout().make_indexer()([*var_ranges.keys()])
    kernel_group = KernelGroup()
    kernel_group.args = self.args
    cpp_kernel_proxy = CppKernelProxy(kernel_group)
    bodies = []
    var_sizes_list = []
    for i, node in enumerate(nodes):
        output_name = output_names[i]
        node = node.data if isinstance(node, ir.ComputedBuffer) else node
        assert isinstance(node, ir.Pointwise), node

        def fn(*args):
            # args = (index_vars, reduction_vars); pointwise => no reductions.
            assert len(args) == 2
            assert len(args[0]) == len(var_sizes[0])
            assert len(args[1]) == 0
            new_args = [arg + offset for arg, offset in zip(args[0], offsets[i])]  # type: ignore[arg-type]
            if reindexers[i] is not None:
                new_args = reindexers[i](new_args)  # type: ignore[misc]
            V.ops.store(
                output_name,
                output_index,
                node.make_loader()(new_args).value,
            )

        # fn is consumed immediately by LoopBody, so closing over i is safe.
        body = LoopBody(
            fn,
            (list(var_ranges.keys()), ()),
            var_ranges,
            list(var_ranges.keys()),
            tuple(),
        )
        bodies.append(body)
        var_sizes_list.append(var_sizes)

    cpp_kernel_proxy.codegen_loop_bodies(bodies, var_sizes_list)
    kernel_group.finalize_kernel(cpp_kernel_proxy, [])
    return kernel_group.loops_code.getvalue()
346
+
347
def store_output(
    self,
    dst: ir.Buffer,
    src: ir.Buffer,
    orig_src: Optional[ir.Buffer] = None,
    epilogue_nodes: Optional[list[ir.IRNode]] = None,
    offsets: Optional[list[Any]] = None,
    reindexers: Optional[list[Optional[Callable[[list[Any]], list[Any]]]]] = None,
):
    """
    Store the `src` buffer to the `dst` buffer. The size of `src` and `dst` should match.
    If `epilogue_nodes` is provided, the `src` buffer is firstly computed with the epilogues
    before stored to `dst`. The `epilogues_nodes` are all pointwise.

    Notes:
    1. `src` and `dst` buffer could be the same buffer in which case we are doing in-place compute
       and stores. In case `epilogue_nodes` are not provided, we do nothing.
    2. The `epilogue_nodes`, if exist, have computations on `src` before storing to `dst` but since
       they come form the original Inductor IR, they might need to be adjusted before working with
       `src` and `dst` as outlined below:
       a) `src` or `dst` buffer could be a sub-slice of the ranges the `epilogue_nodes`work on.
          In this case, the `offsets` could be provided to adjust the indices passed to
          `epilogue_nodes` during codegen and the data ranges are also configured according to
          the sizes of `src` and `dst`.
       b) `dst` might be indexed in a different way as the `epilogue_nodes`, hence a `reindexer` is
          needed on the indices to `epilogue_nodes` to match the indexing of `dst`.
       c) If `src` is local, we need to add a local buffer for it and localize the `orig_src` buffer
          in `epilogue_nodes` with `src`.

    Returns the generated C++ store/epilogue source ("" for the in-place no-op case).
    """
    assert isinstance(dst, (ir.Buffer, ir.ReinterpretView))
    assert dst.get_size() == src.get_size(), f"{dst=}, {src=}"
    if offsets:
        offsets = parse_expr_with_index_symbols(offsets)
    if epilogue_nodes:
        with LocalBufferContext(self.args) as scope:
            assert orig_src is not None
            if orig_src.get_name() != src.get_name():
                # src is a template-local stand-in for orig_src: register it
                # as a local buffer so epilogue reads are redirected to it.
                scope.add_local_buffer(
                    src,
                    [
                        orig_src,
                    ],
                )
                epilogue_nodes = scope.localize_nodes(epilogue_nodes)
            return self.store_pointwise_nodes(
                dst,
                epilogue_nodes,  # type: ignore[arg-type]
                offsets,
                reindexers,
            )
    else:
        if dst.get_name() != src.get_name():
            # src is local
            copy = L.copy(dst, src).data.data
            with LocalBufferContext(self.args) as scope:
                scope.add_local_buffer(src)
                return self.store_pointwise_nodes(dst, [copy])
        else:
            # In-place with no epilogues: nothing to emit.
            assert dst.layout == src.layout, f"{dst=}, {src=}"
            return ""
407
+
408
def store_outputs(
    self,
    dst: tuple[ir.Buffer],
    src: tuple[ir.IRNode],
    orig_src: Optional[tuple[ir.IRNode]] = None,
    epilogue_nodes: Optional[list[ir.IRNode]] = None,
    offsets: Optional[list[Any]] = None,
    reindexers: Optional[list[Optional[Callable[[list[Any]], list[Any]]]]] = None,
    multi_output_buffers: Optional[tuple[ir.MultiOutput]] = None,
):
    """
    Grouped-GEMM analogue of ``store_output``: store each ``src[i]`` into
    ``dst[i]``, optionally applying pointwise ``epilogue_nodes`` first.

    GEMM outputs that an epilogue reads (directly or via their MultiOutput
    wrapper) are localized into kernel-local buffers; the rest are copied
    straight to their destinations. Returns the generated C++ source
    ("" when everything is already in place).
    """
    assert isinstance(dst, Iterable)
    assert all(_dst.get_size() == _src.get_size() for _src, _dst in zip(src, dst))
    if offsets:
        offsets = parse_expr_with_index_symbols(offsets)
    gemm_num = len(src)
    final_offsets = []
    output_names = []
    if epilogue_nodes:
        if not reindexers:
            reindexers = [None] * len(epilogue_nodes)
        with LocalBufferContext(self.args) as scope:
            assert orig_src is not None
            localize_epilogue_nodes = []
            # Collect everything the epilogues read so we can decide which
            # GEMM outputs must be localized.
            all_read_names = []
            for epilogue in epilogue_nodes:
                all_read_names.extend(list(epilogue.get_read_names()))
            localize_epilogue_nodes.extend(scope.localize_nodes(epilogue_nodes))
            final_offsets.extend([offsets] * len(localize_epilogue_nodes))
            output_names.extend(
                [node.get_name() for node in localize_epilogue_nodes]
            )
            for gemm_idx in range(gemm_num):
                if orig_src[gemm_idx].get_name() != src[gemm_idx].get_name():
                    if orig_src[gemm_idx].get_name() in all_read_names or (
                        multi_output_buffers
                        and multi_output_buffers[gemm_idx].get_name()
                        in all_read_names
                    ):
                        # If any of the Epilogue nodes use this GEMM output, let's localize the GEMM output
                        global_buffers = [orig_src[gemm_idx]]
                        if (
                            multi_output_buffers
                            and multi_output_buffers[gemm_idx].get_name()
                            in all_read_names
                            and orig_src[gemm_idx].get_name() not in all_read_names
                        ):
                            # Epilogue might directly read the MultiOutput, Locallize MultiOutput to the local Buffer
                            # if this MultiOutput has not been stored by in-template epilogue
                            # otherwise, use the cse store cache if it will be stored before used
                            global_buffers.append(multi_output_buffers[gemm_idx])
                        scope.add_local_buffer(
                            src[gemm_idx],
                            global_buffers,
                        )
                    else:
                        # Unused by epilogues: still local, but no aliasing of
                        # global buffers is needed — emit a plain copy to dst.
                        scope.add_local_buffer(src[gemm_idx])
                        localize_epilogue_nodes.extend(
                            [L.copy(dst[gemm_idx], src[gemm_idx]).data.data]
                        )
                        reindexers.append(None)
                        output_names.append(dst[gemm_idx].get_name())
                        final_offsets.append(
                            [sympy.S.Zero] * len(dst[gemm_idx].get_size())
                        )
            res = self.store_grouped_gemm_pointwise_nodes(
                dst,
                localize_epilogue_nodes,
                final_offsets,
                reindexers,
                output_names=output_names,
            )
            for gemm_idx in range(gemm_num):
                if (
                    multi_output_buffers
                    and multi_output_buffers[gemm_idx].get_name() in all_read_names
                ):
                    # If the MultiOutput is used in the Epilogue, let's remove it from args
                    multi_output_name = multi_output_buffers[gemm_idx].get_name()
                    if (
                        multi_output_name in self.args.output_buffers
                        and self.args.output_buffers[multi_output_name]
                        is not REMOVED
                    ):
                        self.remove_buffer(multi_output_name)
            return res
    else:
        if dst[0].get_name() != src[0].get_name():
            # No epilogues, but srcs are local: copy each src into its dst.
            copy_list = []
            with LocalBufferContext(self.args) as scope:
                for _src, _dst in zip(src, dst):
                    copy_list.extend([L.copy(_dst, _src).data.data])
                    scope.add_local_buffer(_src)
                    output_names.append(_dst.get_name())
                    final_offsets.append([sympy.S.Zero] * len(_dst.get_size()))
                reindexers = [None] * len(copy_list)
                return self.store_grouped_gemm_pointwise_nodes(
                    dst,
                    nodes=copy_list,
                    offsets=final_offsets,
                    reindexers=reindexers,
                    output_names=output_names,
                )
        else:
            # Fully in place: every src already is its dst.
            assert all(
                _src.get_name() == _dst.get_name() for _src, _dst in zip(src, dst)
            )
            assert all(
                _src.get_layout() == _dst.get_layout()
                for _src, _dst in zip(src, dst)
            )
            return ""
519
+
520
def check_bounds(self, expr, size, lower, upper):
    """Intentional no-op: template kernels skip bounds-check codegen."""
    # CppTemplateKernel does not need codegen related operations
    return None
523
+
524
+
525
class CppTemplateCaller(ir.ChoiceCaller):
    """
    CppTemplateCaller

    This class represents a caller for CPP template kernels. It is a subclass of ir.ChoiceCaller.
    Attributes:
        name (str): The name of the caller.
        category (str): The category of the caller.
        bmreq (CppBenchmarkRequest): The benchmark request for the caller.
        template_buffer (ir.CppTemplateBuffer): The template buffer for the caller.
    """

    def __init__(
        self,
        name: str,
        category: str,
        input_nodes: list[ir.Buffer],
        layout: ir.Layout,
        make_kernel_render: Callable[
            [
                ir.CppTemplateBuffer,
                bool,
                Optional[list[ir.IRNode]],
            ],
            str,
        ],
        bmreq: CppBenchmarkRequest,
        template: "CppTemplate",  # type: ignore[name-defined] # noqa: F821
        info_kwargs: Optional[
            dict[str, Union[ir.PrimitiveInfoType, list[ir.PrimitiveInfoType]]]
        ] = None,
    ):
        super().__init__(name, input_nodes, layout, description="")
        self.category = category
        self.make_kernel_render = make_kernel_render
        self.bmreq = bmreq
        self.template = template
        self.info_kwargs = info_kwargs

    def precompile(self) -> None:
        """Delegate ahead-of-time compilation to the benchmark request."""
        assert self.bmreq is not None
        self.bmreq.precompile()

    def benchmark(self, *args, out) -> float:
        """Benchmark this choice, optionally via the profiling-based timer."""
        assert self.bmreq is not None
        if config.profile_bandwidth_with_do_bench_using_profiling:
            algo = self.bmreq.make_run_fn(*args, out=out)
            return do_bench_using_profiling(algo)
        return self.bmreq.benchmark(*args, out=out)

    def hash_key(self) -> str:
        """Cache key: category plus the benchmark request's hash."""
        return "-".join(
            [
                self.category,
                self.bmreq.hash_key,
            ]
        )

    def info_dict(
        self,
    ) -> dict[str, Union[ir.PrimitiveInfoType, list[ir.PrimitiveInfoType]]]:
        """Minimal metadata used by autotune logging."""
        return {"backend": "CPP", "op_type": "unknown"}

    def output_node(self) -> ir.TensorBox:
        """Materialize this choice as a CppTemplateBuffer wrapped in a TensorBox."""
        return ir.TensorBox.create(
            ir.CppTemplateBuffer(
                layout=self.layout,
                inputs=self.input_nodes,
                make_kernel_render=self.make_kernel_render,
                template=self.template,
                choice=self,
            )
        )
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_utils.py ADDED
@@ -0,0 +1,776 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import contextlib
3
+ import dataclasses
4
+ import functools
5
+ import math
6
+ import sys
7
+ from collections import namedtuple
8
+ from collections.abc import Sequence
9
+ from typing import Any, Callable, Optional
10
+ from unittest.mock import patch
11
+
12
+ import sympy
13
+
14
+ import torch
15
+ from torch._prims_common import is_integer_dtype
16
+ from torch.utils._ordered_set import OrderedSet
17
+ from torch.utils._sympy.printers import CppPrinter as _CppPrinter
18
+ from torch.utils._sympy.symbol import symbol_is_type, SymT
19
+ from torch.utils._sympy.value_ranges import ValueRanges
20
+
21
+ from .. import ir
22
+ from ..dependencies import Dep
23
+ from ..loop_body import LoopBody
24
+ from ..scheduler import BaseSchedulerNode, SchedulerBuffer
25
+ from ..utils import IndentedBuffer, sympy_index_symbol_with_prefix, sympy_subs
26
+ from ..virtualized import ops, OpsValue, V
27
+ from .common import CSEVariable, Kernel, KernelArgs, OptimizationContext
28
+
29
+
30
+ DTYPE_TO_CPP = {
31
+ torch.float32: "float",
32
+ torch.float64: "double",
33
+ torch.float16: "at::Half",
34
+ torch.int64: "int64_t",
35
+ torch.int32: "int32_t",
36
+ torch.int16: "int16_t",
37
+ torch.int8: "int8_t",
38
+ torch.uint64: "uint64_t",
39
+ torch.uint32: "uint32_t",
40
+ torch.uint16: "uint16_t",
41
+ torch.uint8: "uint8_t",
42
+ torch.bool: "bool",
43
+ torch.bfloat16: "at::BFloat16",
44
+ torch.complex32: "at::complex<at::Half>",
45
+ torch.complex64: "at::complex<float>",
46
+ torch.complex128: "at::complex<double>",
47
+ torch.float8_e4m3fn: "at::Float8_e4m3fn",
48
+ torch.float8_e5m2: "at::Float8_e5m2",
49
+ torch.float8_e4m3fnuz: "at::Float8_e4m3fnuz",
50
+ torch.float8_e5m2fnuz: "at::Float8_e5m2fnuz",
51
+ }
52
+
53
+ DTYPE_TO_ATEN = {
54
+ torch.float32: "at::kFloat",
55
+ torch.float64: "at::kDouble",
56
+ torch.float16: "at::kHalf",
57
+ torch.int64: "at::kLong",
58
+ torch.int32: "at::kInt",
59
+ torch.int16: "at::kShort",
60
+ torch.int8: "at::kChar",
61
+ torch.uint64: "at::kUInt64",
62
+ torch.uint32: "at::kUInt32",
63
+ torch.uint16: "at::kUInt16",
64
+ torch.uint8: "at::kByte",
65
+ torch.uint32: "at::kUInt32",
66
+ torch.uint64: "at::kUInt64",
67
+ torch.bool: "at::kBool",
68
+ torch.bfloat16: "at::kBFloat16",
69
+ torch.complex32: "at::kComplexHalf",
70
+ torch.complex64: "at::kComplexFloat",
71
+ torch.complex128: "at::kComplexDouble",
72
+ torch.float8_e4m3fn: "at::kFloat8_e4m3fn",
73
+ torch.float8_e5m2: "at::kFloat8_e5m2",
74
+ torch.float8_e4m3fnuz: "at::kFloat8_e4m3fnuz",
75
+ torch.float8_e5m2fnuz: "at::kFloat8_e5m2fnuz",
76
+ }
77
+
78
+ DEVICE_TO_ATEN = {
79
+ "meta": "at::kMeta",
80
+ "cpu": "at::kCPU",
81
+ "cuda": "at::kCUDA",
82
+ "xpu": "at::kXPU",
83
+ "mps": "at::kMPS",
84
+ }
85
+
86
+ LAYOUT_TO_ATEN = {
87
+ torch.strided: "at::kStrided",
88
+ torch._mkldnn: "at::kMkldnn", # type: ignore[attr-defined]
89
+ }
90
+
91
+ # matches c10/core/DeviceType.h
92
+ DEVICE_TO_INT = {"cpu": 0, "cuda": 1}
93
+
94
+ _IS_WINDOWS = sys.platform == "win32"
95
+
96
+ INDEX_TYPE = "int64_t"
97
+
98
+ GemmBlocking = namedtuple("GemmBlocking", ["block_m", "block_n", "block_k"])
99
+
100
+
101
def get_promote_dtype(args):
    """Return the torch dtype that all CppCSEVariable args promote to.

    Returns None when there is not enough information: either some
    CppCSEVariable has an unknown (None) dtype, or — new guard — no arg is a
    CppCSEVariable at all (the original code crashed with TypeError on
    ``functools.reduce`` over an empty sequence in that case).
    """
    dtypes = [n.dtype for n in args if isinstance(n, CppCSEVariable)]
    if not dtypes or any(dtype is None for dtype in dtypes):
        return None  # not enough info to calculate the promote dtype
    return functools.reduce(torch.promote_types, dtypes)  # type: ignore[arg-type]
110
+
111
+
112
def promote_args(new_args):
    """
    Promote all CppCSEVariable args to their common dtype (via
    ``get_promote_dtype``) by emitting ``ops.to_dtype`` conversions.
    Args are returned unchanged when any dtype is unknown or there is no
    common promote type.
    """

    def promote_arg(arg, promote_type):
        # Convert only CSE variables whose dtype is known and differs.
        if (
            isinstance(arg, CppCSEVariable)
            and arg.dtype
            and promote_type
            and arg.dtype != promote_type
        ):
            arg = ops.to_dtype(arg, promote_type)
            # to_dtype may hand back an OpsValue wrapper; unwrap it.
            arg = arg.value if isinstance(arg, OpsValue) else arg
            arg.dtype = promote_type
        return arg

    promote_type = get_promote_dtype(new_args)
    promote_fn = functools.partial(
        promote_arg,
        promote_type=promote_type,
    )
    if (
        all(
            new_arg.dtype is not None
            for new_arg in new_args
            if isinstance(new_arg, CppCSEVariable)
        )
        and promote_type
    ):
        new_args = list(map(promote_fn, new_args))
    return new_args
140
+
141
+
142
class CppCSEVariable(CSEVariable):
    """CSE variable for the C++ backend.

    Tracks, in addition to the base class, whether the value is vectorized
    (``is_vec``) and which loop itervars it depends on
    (``dependent_itervars``).
    """

    def __init__(
        self,
        name,
        bounds: ValueRanges[Any],
        dtype: Optional[torch.dtype] = None,
    ) -> None:
        super().__init__(name, bounds, dtype)
        # Whether this variable holds a vector value (set lazily from args).
        self.is_vec = False
        # Loop itervars this value depends on (drives vectorization checks).
        self.dependent_itervars = OrderedSet[sympy.Symbol]()

    def __repr__(self) -> str:
        return (
            f"CppCSEVariable(name: {self.name}, bounds: {self.bounds}, is_vec: {self.is_vec}, dtype: {self.dtype}, "
            f"dependent_itervars: {self.dependent_itervars})"
        )

    def update_on_args(self, name, args, kwargs):
        """Refresh dependent itervars / is_vec after op *name* produced this var."""
        if name == "load":
            # args[2] is index
            self._set_dependent_itervars(args[2])
        else:
            # propagate relevant itervars and is_vec from args
            self.dependent_itervars.update(
                *[
                    arg.dependent_itervars
                    for arg in args
                    if isinstance(arg, CppCSEVariable)
                ]
            )
            if name == "index_expr":
                self._set_dependent_itervars(args[0])
            if any(arg.is_vec for arg in args if isinstance(arg, CppCSEVariable)):
                self.is_vec = True

    def _set_dependent_itervars(self, index: sympy.Expr):
        """
        Set the relevant itervars for this variable based on the `index` expression.
        This includes the itervars directly used in the `index` as well as relevant itervars
        of other cse variables used in the `index`.
        """
        for s in index.free_symbols:
            if s in V.kernel.itervars:
                self.dependent_itervars.add(s)  # type: ignore[arg-type]
            elif s.name in V.kernel.cse.varname_map:  # type: ignore[attr-defined]
                # Inherit the dependencies of the CSE variable named by s.
                self.dependent_itervars.update(
                    V.kernel.cse.varname_map[s.name].dependent_itervars  # type: ignore[attr-defined]
                )

    def depends_on(self, itervar: sympy.Symbol):
        """Return True if this value depends on the given loop itervar."""
        return itervar in self.dependent_itervars
193
+
194
+
195
class CppPrinter(_CppPrinter):
    """Sympy-to-C++ printer that simplifies via graph sizevars before printing."""

    def doprint(self, expr, *, simplify: bool = True, p=True):
        # TODO: why are people passing strings to the printer here :think:
        # NOTE(review): `p` is accepted but never used — presumably kept for
        # signature compatibility with callers; confirm before removing.
        if simplify and isinstance(expr, sympy.Expr) and hasattr(V.graph, "sizevars"):
            expr = V.graph.sizevars.simplify(expr)
        return super().doprint(expr)
201
+
202
+
203
# Module-level printer: `cexpr(expr)` renders a sympy expression (symbols
# included) as C++ source text.
cexpr = CppPrinter().doprint
205
+
206
+
207
def cexpr_index(index):
    """Render *index* as C++ and cast it to INDEX_TYPE (int64_t)."""
    return f"static_cast<{INDEX_TYPE}>({cexpr(index)})"
209
+
210
+
211
def value_to_cpp(value, cpp_type):
    """Render the Python scalar *value* as a C++ expression of type *cpp_type*."""
    if isinstance(value, bool):
        # C++ spells boolean literals in lowercase.
        return f"static_cast<{cpp_type}>({str(value).lower()})"
    if value == float("-inf"):
        return f"-std::numeric_limits<{cpp_type}>::infinity()"
    if value == float("inf"):
        return f"std::numeric_limits<{cpp_type}>::infinity()"
    if math.isnan(value):
        return f"std::numeric_limits<{cpp_type}>::quiet_NaN()"
    return f"static_cast<{cpp_type}>({repr(value)})"
222
+
223
+
224
def rewrite_index_for_function(
    localize_buffer_handler: "LocalizeBufferHandler",
    index: sympy.Expr,
    global_buf_name: str,
):
    """
    Rewrite a global-buffer index so it addresses the corresponding local
    buffer, which covers only the innermost dims of the defining op's loop
    nest: itervars outside those dims are zeroed out of the index.
    """
    # Local buffer at the inner dimensions
    snode = V.graph.scheduler.name_to_buf[global_buf_name].defining_op
    assert snode is not None
    local_buf = localize_buffer_handler.global_to_local[global_buf_name]
    scheduler_nodes = snode.get_nodes()
    # Take loop groups from the reduction node if present (widest ranges).
    _, (group, reduction_group) = max(
        scheduler_nodes, key=lambda x: int(x.is_reduction())
    ).group
    call_ranges = tuple(group) + tuple(reduction_group)
    # Keep only the innermost itervars, as many as the local buffer has dims.
    indices_to_keep = [
        f"x{len(call_ranges) - (idx + 1)}"
        for idx in range(len(local_buf.get_layout().size))
    ]
    sorted_symbols = sorted(index.free_symbols, key=lambda s: s.name)  # type: ignore[attr-defined]
    replacements = {}
    for x in sorted_symbols:
        if x.name.startswith("x") and x.name not in indices_to_keep:  # type: ignore[attr-defined]
            # Only keep index used by local buffer
            replacements[x] = sympy.core.numbers.Zero()
    index = sympy_subs(index, replacements)  # type: ignore[arg-type]
    return index
250
+
251
+
252
def rewrite_index_for_nodes(
    localize_buffer_handler: "LocalizeBufferHandler",
    index: sympy.Expr,
    global_buf_name: str,
):
    """
    Rewrite a global-buffer index for IR-node localization: rebuild the flat
    index from the local buffer's own layout, substituting 0 for any INDEX
    itervar the original expression does not actually use.
    """
    used_vars = OrderedSet(
        s for s in index.free_symbols if symbol_is_type(s, SymT.INDEX)
    )
    index_vars = []
    local_buf = localize_buffer_handler.global_to_local[global_buf_name]
    for i in range(len(local_buf.get_size())):
        var = sympy_index_symbol_with_prefix(SymT.INDEX, i)
        # Unused dims collapse to 0 in the local layout's indexer.
        index_vars.append(var if var in used_vars else 0)
    index = local_buf.get_layout().make_indexer()(index_vars)
    return index
267
+
268
+
269
class LocalizeBufferHandler(V.WrapperHandler):  # type: ignore[name-defined]
    """Ops wrapper that redirects loads/stores on mapped global buffers to
    their local counterparts, rewriting indices via *rewrite_index*."""

    def __init__(
        self,
        inner,
        global_to_local: dict[str, ir.Buffer],
        rewrite_index: Callable[["LocalizeBufferHandler", sympy.Expr, str], sympy.Expr],
    ) -> None:
        super().__init__(inner)
        self.global_to_local = global_to_local
        self.rewrite_index = rewrite_index

    def localize(self, name: str, index: sympy.Expr):
        """Map (global name, index) -> (local name, rewritten index); pass through otherwise."""
        if self.global_to_local and name in self.global_to_local:
            assert self.rewrite_index is not None
            index = self.rewrite_index(self, index, name)
            name = self.global_to_local[name].get_name()
        return name, index

    def load(self, name: str, index: sympy.Expr):
        return self._inner.load(*self.localize(name, index))

    def store(self, name, index, value, mode=None):
        local_buffer_name, local_buffer_index = self.localize(name, index)
        res = self._inner.store(local_buffer_name, local_buffer_index, value, mode)
        if (
            self.global_to_local
            and name in self.global_to_local
            and isinstance(V.kernel, Kernel)
        ):
            # Remove name of local buffer from Kernel.store_buffer_names
            # local_buffer_name is added to Kernel.store_buffer_names in Kernel.CSEProxy.store.
            V.kernel.store_buffer_names.discard(local_buffer_name)
        return res

    def store_reduction(self, name, index, value):
        return self._inner.store_reduction(*self.localize(name, index), value)
305
+
306
+
307
class LocalBufferContext:
    """
    This class creates a context that helps to generate code involving Inductor IR with
    function local buffers. These buffers are constructed during the codegen process and
    are used to store intermediate results such as local accumulators. We do not want to
    add them to `V.graph` since they are not global and we do not want to add them as
    function arguments either. So we patch the codegen processes under this scope to support
    these buffers without exposure to the outside world.
    """

    def __init__(self, kernel_args: KernelArgs) -> None:
        self.kernel_args = kernel_args
        # ExitStack collects every patch applied in __enter__ so that
        # __exit__ can undo them all in reverse order.
        self.exit_stack = contextlib.ExitStack()
        # map local buffer name to local buffer
        self.local_buffers: dict[str, ir.Buffer] = {}
        # map global buffer name to global buffer
        self.global_buffers: dict[str, ir.Buffer] = {}
        # map global buffer name to local buffer
        self.global_to_local: dict[str, ir.Buffer] = {}
        # record the global buffers that are removed by this LocalBufferContext
        self.removed_buffers: OrderedSet[str] = OrderedSet()

    def __enter__(self):
        """Patch dtype/argument lookups so that local-buffer names resolve
        even though they are not registered in ``V.graph``."""
        self.exit_stack.__enter__()
        original_get_dtype = V.graph.get_dtype

        def get_dtype(name):
            # Local buffers are unknown to V.graph; answer for them here and
            # fall back to the original lookup for everything else.
            if name in self.local_buffers:
                return self.local_buffers[name].get_dtype()
            return original_get_dtype(name)

        self.exit_stack.enter_context(patch.object(V.graph, "get_dtype", get_dtype))

        original_input = self.kernel_args.input

        def input(name):
            # A local buffer is referenced by its own name; it never becomes
            # a generated kernel-argument name.
            if name in self.local_buffers:
                return name
            return original_input(name)

        self.exit_stack.enter_context(patch.object(self.kernel_args, "input", input))

        original_output = self.kernel_args.output

        def output(name):
            # Same as `input` above, but for stores.
            if name in self.local_buffers:
                return name
            return original_output(name)

        self.exit_stack.enter_context(patch.object(self.kernel_args, "output", output))

        # Set current LocalBufferContext into V
        self.exit_stack.enter_context(V.set_local_buffer_context(self))

        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Drop local buffers first, then unwind every patch applied in
        # __enter__ via the ExitStack.
        self.local_buffers.clear()
        self.exit_stack.__exit__(exc_type, exc_val, exc_tb)

    def add_local_buffer(
        self, local_buffer: ir.Buffer, global_buffers: Optional[list[ir.Buffer]] = None
    ):
        """Register `local_buffer`; optionally alias each buffer in
        `global_buffers` to it and mark those globals as removed."""
        assert local_buffer.get_name() not in self.local_buffers
        self.local_buffers[local_buffer.get_name()] = local_buffer
        if global_buffers:
            for global_buffer in global_buffers:
                global_buffer_name = global_buffer.get_name()
                # Each global buffer may be localized at most once.
                assert (
                    global_buffer_name not in self.global_buffers
                    and global_buffer_name not in self.global_to_local
                )
                self.global_buffers[global_buffer_name] = global_buffer
                self.global_to_local[global_buffer_name] = local_buffer
                if global_buffer_name not in V.graph.removed_buffers:
                    # Record the global buffers that are removed by this LocalBufferContext
                    # since which may need to restore. Refer to issue:
                    # https://github.com/pytorch/pytorch/issues/144186
                    self.removed_buffers.add(global_buffer_name)
                V.graph.removed_buffers.add(global_buffer_name)

    def localize_function(
        self,
        fn: Callable[..., Any],
        rewrite_index: Callable[
            ["LocalizeBufferHandler", sympy.Expr, str], sympy.Expr
        ] = rewrite_index_for_function,
    ):
        """Wrap `fn` so that, while it runs, all loads/stores on registered
        global buffers are redirected to their local counterparts."""

        def inner(*args, **kwargs):
            with V.set_ops_handler(
                LocalizeBufferHandler(
                    V.get_ops_handler(),
                    global_to_local=self.global_to_local,
                    rewrite_index=rewrite_index,
                )
            ):
                return fn(*args, **kwargs)

        return inner

    def localize_nodes(
        self,
        nodes: list[ir.IRNode],
        rewrite_index: Callable[
            ["LocalizeBufferHandler", sympy.Expr, str], sympy.Expr
        ] = rewrite_index_for_nodes,
    ) -> list[ir.IRNode]:
        """
        Given `local_buf` and `global_buf` registered in current `LocalBufferContext`
        though the method of `add_local_buffer`, localizes the `global_buf` to `local_buf`
        for the given `nodes` and returns a new list of IR nodes that work on `local_buf`
        instead of `global_buf`, i.e., all the loads and stores are redirected to
        `local_buf`. This helps the fused loops to work on smaller-sized local buffers
        for better data locality.

        The data access of `local_buf` is assumed to be contiguous with the
        same order as the `global_buf`.
        """
        assert len(nodes) > 0

        def wrap_inner_fn_for_node(node: ir.IRNode):
            # A ComputedBuffer wraps its Loops in `.data`; plain Loops nodes
            # are used directly.
            loops = node.data if isinstance(node, ir.ComputedBuffer) else node
            assert isinstance(loops, ir.Loops)
            new_inner_fn = self.localize_function(
                loops.inner_fn,
                rewrite_index,
            )

            new_loops = dataclasses.replace(loops, inner_fn=new_inner_fn)
            if isinstance(node, ir.ComputedBuffer):
                new_node = ir.ComputedBuffer(
                    name=node.get_name(), layout=node.get_layout(), data=new_loops
                )
            else:
                new_node = new_loops  # type: ignore[assignment]

            return new_node

        return [wrap_inner_fn_for_node(node) for node in nodes]
446
+
447
+
448
def unify_mask_base_type(
    buffer: IndentedBuffer,
    vars: tuple[CSEVariable, ...],
    dtype=torch.float,
):
    """
    Cast each CSE variable in ``vars`` to the given mask base dtype.

    Returns a generator yielding the casted CSE variables in order; the cast
    expressions are emitted through the kernel's CSE into ``buffer``.
    """
    return (
        V.kernel.cse.generate(buffer, f"{V.kernel._get_mask_cast(var, dtype)}")
        for var in vars
    )
465
+
466
+
467
def may_unify_binary_op_mask_type(a, b):
    """
    Given two cse variables, when dtype is bool, unify them to the same mask
    dtype (int32) and return the casted cse variables; otherwise return the
    operands unchanged.
    """
    if a.dtype != torch.bool:
        return a, b
    # Boolean masks must both be boolean before widening to the int mask type.
    assert b.dtype == torch.bool
    return unify_mask_base_type(V.kernel.compute, (a, b), torch.int32)
476
+
477
+
478
def codegen_rand(offset, code, rand_function, dst_dtype=torch.float32):
    """
    Append to `code` a C++ immediately-invoked lambda that produces a vector
    of random values.

    `offset` is a vectorized integer CSE variable holding per-lane RNG
    offsets; `rand_function` is a single C++ statement (string) expected to
    fill `result[offset_idx]` from `offset[offset_idx]`.  The lambda spills
    the offsets to a scalar array, runs `rand_function` per lane, then
    reloads `result` as a Vectorized/VectorizedN of `dst_dtype`.
    Returns the same `code` buffer for chaining.
    """
    assert is_integer_dtype(offset.dtype)
    code.writeline("[&]()")
    with code.indent():
        # Scalar staging arrays sized by the kernel's vector tiling factor.
        code.writeline(
            f"{DTYPE_TO_CPP[offset.dtype]} offset[{V.kernel.tiling_factor}];"
        )
        code.writeline(f"{DTYPE_TO_CPP[dst_dtype]} result[{V.kernel.tiling_factor}];")
        # Spill the vectorized offsets into the scalar array.
        code.writeline(f"{offset}.store(offset);")
        code.writeline(
            f"for( {DTYPE_TO_CPP[offset.dtype]} offset_idx = 0; offset_idx < {V.kernel.tiling_factor}; offset_idx++ )"
        )
        with code.indent():
            code.writeline(rand_function)
        # One Vectorized covers the tile when it fits; otherwise use
        # VectorizedN spanning several hardware vectors.
        num_vectors = V.kernel._get_num_vectors(dtype=dst_dtype)
        if num_vectors == 1:
            code.writeline(
                f"return at::vec::Vectorized<{DTYPE_TO_CPP[dst_dtype]}>::loadu(result);"
            )
        else:
            code.writeline(
                f"return at::vec::VectorizedN<{DTYPE_TO_CPP[dst_dtype]}, {num_vectors}>::loadu(result);"
            )
    # Immediately invoke the lambda.
    code.writeline("()")
    return code
503
+
504
+
505
def get_gemm_template_output_and_compute_dtype(input_dtype):
    """
    Return ``(output_dtype, compute_dtype)`` for the GEMM template.

    Integer (u8/s8) inputs accumulate in int32; every other input dtype
    computes and outputs in float32.
    """
    if input_dtype in (torch.uint8, torch.int8):
        return (torch.int32, torch.int32)
    return (torch.float32, torch.float32)
510
+
511
+
512
def create_epilogue_with_attr(input_buffer, attr, **kwargs):
    """
    Build a Pointwise IR node applying the fused epilogue `attr` on top of
    `input_buffer`.

    Supported attrs: relu, gelu (algorithm="none"|"tanh"), swish, sigmoid,
    tanh, hardswish, hardsigmoid, leaky_relu, hardtanh, add/sub/mul (with
    `other`), and bias_add (with `other`, `beta`, `dtype`).  Extra operands
    and scalars come in via **kwargs.  Raises ValueError for unknown attrs.
    """
    input_loader = input_buffer.make_loader()
    dtype = input_buffer.get_dtype()
    if attr == "relu":

        def inner_fn(index):
            input = input_loader(index)
            zero = ops.constant(0, dtype)
            return ops.maximum(input, zero)

    elif attr == "gelu":
        assert "algorithm" in kwargs
        if kwargs["algorithm"] == "none":
            # Exact GELU: x * 0.5 * (1 + erf(x / sqrt(2))); computed in fp32
            # and cast back to the input dtype when needed.

            def inner_fn(index):
                input = input_loader(index)
                if dtype != torch.float:
                    input = ops.to_dtype(input, torch.float)
                half = ops.constant(0.5, torch.float)
                one = ops.constant(1.0, torch.float)
                const = ops.constant(0.7071067811865476, torch.float)
                result = input * half * (ops.erf(input * const) + one)
                if dtype != torch.float:
                    result = ops.to_dtype(result, dtype)
                return result

        else:
            assert kwargs["algorithm"] == "tanh"
            # Tanh approximation:
            # 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))

            def inner_fn(index):
                input = input_loader(index)
                if dtype != torch.float:
                    input = ops.to_dtype(input, torch.float)
                half = ops.constant(0.5, torch.float)
                one = ops.constant(1.0, torch.float)
                const1 = ops.constant(0.7978845608028654, torch.float)
                const2 = ops.constant(0.044715, torch.float)
                result = (
                    half
                    * input
                    * (
                        one
                        + ops.tanh(const1 * (input + const2 * input * input * input))
                    )
                )
                if dtype != torch.float:
                    result = ops.to_dtype(result, dtype)
                return result

    elif attr == "swish":

        def inner_fn(index):
            input = input_loader(index)
            result = input * ops.sigmoid(input)
            return result

    elif attr == "sigmoid":

        def inner_fn(index):
            return ops.sigmoid(input_loader(index))

    elif attr == "tanh":

        def inner_fn(index):
            return ops.tanh(input_loader(index))

    elif attr == "hardswish" or attr == "hardsigmoid":

        def hardsigmoid_float(input):
            # clamp(x + 3, 0, 6) / 6, computed in fp32.
            zero = ops.constant(0, torch.float)
            six = ops.constant(6, torch.float)
            three = ops.constant(3, torch.float)
            one_over_six = ops.constant(0.16666666666666666, torch.float)
            max = ops.maximum(input + three, zero)
            min = ops.minimum(max, six)
            return min * one_over_six

        def inner_fn(index):
            input = input_loader(index)
            if dtype != torch.float:
                input = ops.to_dtype(input, torch.float)
            result = hardsigmoid_float(input)
            # hardswish = x * hardsigmoid(x); hardsigmoid keeps result as-is.
            if attr == "hardswish":
                result = input * result
            if dtype != torch.float:
                result = ops.to_dtype(result, dtype)
            return result

    elif attr == "leaky_relu":
        assert "scalars" in kwargs
        assert len(kwargs["scalars"]) == 1
        negative_slope = kwargs["scalars"][0]

        def inner_fn(index):
            input = input_loader(index)
            if dtype != torch.float:
                input = ops.to_dtype(input, torch.float)
            zero = ops.constant(0, torch.float)
            result = ops.where(
                input > zero, input, input * ops.constant(negative_slope, torch.float)
            )
            if dtype != torch.float:
                result = ops.to_dtype(result, dtype)
            return result

    elif attr == "hardtanh":
        assert "scalars" in kwargs
        assert len(kwargs["scalars"]) == 2
        min_value = kwargs["scalars"][0]
        max_value = kwargs["scalars"][1]

        def inner_fn(index):
            input = input_loader(index)
            if dtype != torch.float:
                input = ops.to_dtype(input, torch.float)
            result = ops.minimum(
                ops.maximum(input, ops.constant(min_value, torch.float)),
                ops.constant(max_value, torch.float),
            )
            if dtype != torch.float:
                result = ops.to_dtype(result, dtype)
            return result

    elif attr in ["add", "sub", "mul"]:
        assert "other" in kwargs
        other = kwargs["other"]
        # `other` may have fewer dims than the input; drop the leading
        # `dims_diff` index entries when loading it (trailing-dim broadcast).
        num_input_dims = len(input_buffer.get_size())
        num_other_dims = len(other.get_size())
        dims_diff = num_input_dims - num_other_dims
        other_loader = other.make_loader()

        def inner_fn(index):
            op = getattr(ops, attr)
            if dims_diff != 0:
                return op(input_loader(index), other_loader(index[dims_diff:]))
            else:
                return op(input_loader(index), other_loader(index))

    elif attr == "bias_add":
        assert "other" in kwargs
        assert "beta" in kwargs
        assert "dtype" in kwargs
        beta = kwargs["beta"]
        other = kwargs["other"]
        # NOTE: this rebinds `dtype`, so the resulting Pointwise node uses
        # the caller-provided dtype rather than the input buffer's.
        dtype = kwargs["dtype"]
        bias_loader = other.make_loader()

        def inner_fn(index):
            bias = bias_loader(index)
            input = input_loader(index)
            # beta == 1 skips the scaling multiply.
            if beta != 1:
                result = ops.constant(beta, torch.float) * bias + input
            else:
                result = bias + input
            return result

    else:
        raise ValueError(f"Unsupported epilogue attribute: {attr}")
    return ir.Pointwise(
        device=input_buffer.get_device(),
        dtype=dtype,
        inner_fn=inner_fn,
        ranges=input_buffer.get_size(),
    )
676
+
677
+
678
def _get_loop_body(fn_list):
    """
    Extract the `LoopBody` behind each entry of ``fn_list``.

    Entries are either `LoopBody` instances, wrappers produced by
    `LocalBufferContext.localize_function` (carrying the wrapped callable as
    ``original_fn``), or `functools.partial` objects whose first positional
    argument owns ``_body``.
    """
    if all(isinstance(fn, LoopBody) for fn in fn_list):
        loop_bodies = fn_list
    elif hasattr(fn_list[0], "original_fn"):
        # For the case of local buffer, we wrap the fn with localize_function
        assert all(hasattr(fn, "original_fn") for fn in fn_list)
        assert all(
            isinstance(fn.original_fn.args[0]._body, LoopBody) for fn in fn_list
        )
        loop_bodies = [fn.original_fn.args[0]._body for fn in fn_list]
    else:
        assert all(isinstance(fn, functools.partial) for fn in fn_list)
        assert all(isinstance(fn.args[0]._body, LoopBody) for fn in fn_list)
        loop_bodies = [fn.args[0]._body for fn in fn_list]
    assert loop_bodies is not None
    return loop_bodies
695
+
696
+
697
def _get_dtype_from_loopbodies(loop_bodies):
    """Collect the dtypes recorded on every `call_method` FX node across the
    root graph and all subblock graphs of the given loop bodies."""
    dtypes = OrderedSet[torch.dtype]()
    for loop_body in loop_bodies:
        sub_graphs = [block.graph for block in loop_body.subblocks.values()]
        for graph in [loop_body.root_block.graph, *sub_graphs]:
            for node in graph.nodes:
                if node.op == "call_method":
                    dtypes.add(node.meta[OptimizationContext.key].dtype)
    return dtypes
709
+
710
+
711
def template_fusion_with_epilogues_supported(
    template: BaseSchedulerNode, epilogues: list[BaseSchedulerNode]
) -> tuple[bool, bool]:
    """
    Decide whether `template` can be fused with the given epilogue nodes.

    Returns ``(supported, same_index)``: fusion is supported only when every
    epilogue reads the template's output buffer(s) through at most one index
    expression, and that read index matches the epilogue's write indexes.
    """

    def _get_indexes_of_template_buf_read(
        epilogue_node: ir.Operation, template_buf_names: list[str]
    ) -> list[sympy.Expr]:
        # All index expressions this epilogue uses to read template outputs.
        return [
            read.index
            for read in epilogue_node.get_reads()
            if read.name in template_buf_names
        ]

    def _check_supported_and_same_indexes(
        index_of_template_buf_read: Sequence[sympy.Expr],
        epilogue_writes: OrderedSet[Dep],
    ) -> tuple[bool, bool]:
        num_indexes = len(OrderedSet(index_of_template_buf_read))

        if num_indexes > 1:
            same_index = False
            supported = False  # Different read indexes not supported
        elif num_indexes == 0:
            same_index = True
            supported = True  # No reads, automatically supported
        elif num_indexes == 1:
            iotbr = index_of_template_buf_read[0]
            same_index = all(write.index == iotbr for write in epilogue_writes)
            # TODO: Add support of fusion when the read of template buffer and the write of epilogue output
            # in the epilogue node don't have the same index and change supported to True
            supported = same_index
        else:
            raise AssertionError("Should not reach here")

        return supported, same_index

    def _template_fusion_supported(
        template_outputs: Sequence[SchedulerBuffer], epilogue_nodes: list[ir.Operation]
    ) -> tuple[bool, bool]:
        template_buf_names = [x.get_name() for x in template_outputs]
        indexes_of_template_buf_reads = [
            _get_indexes_of_template_buf_read(epilogue_node, template_buf_names)
            for epilogue_node in epilogue_nodes
        ]
        epilogue_nodes_writes = [
            epilogue_node.get_read_writes().writes for epilogue_node in epilogue_nodes
        ]

        # Fusion requires every epilogue to individually pass the check.
        results = [
            _check_supported_and_same_indexes(reads, writes)
            for reads, writes in zip(
                indexes_of_template_buf_reads, epilogue_nodes_writes
            )
        ]
        supported, same_indexes = zip(*results)
        return all(supported), all(same_indexes)

    assert template.is_template()
    template_outputs = template.get_outputs()

    # Flatten scheduler nodes to their underlying IR operations.
    epilogue_nodes = [
        n.node
        for epilogue in epilogues
        for n in epilogue.get_nodes()
        if n.node is not None
    ]
    return _template_fusion_supported(template_outputs, epilogue_nodes)
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_wrapper_cpu.py ADDED
The diff for this file is too large to render. See raw diff
 
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py ADDED
@@ -0,0 +1,878 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ from collections.abc import Sequence
3
+ from typing import Any, Callable, Optional, Union
4
+
5
+ import sympy
6
+
7
+ import torch
8
+ import torch._inductor.async_compile # noqa: F401 required to warm up AsyncCompile pools
9
+ import torch._ops
10
+
11
+ from .. import config, ir
12
+ from ..utils import sympy_product
13
+ from ..virtualized import V
14
+ from .cpp_utils import DTYPE_TO_CPP
15
+ from .cpp_wrapper_cpu import CppWrapperCpu
16
+ from .wrapper import (
17
+ BufferLike,
18
+ EnterSubgraphLine,
19
+ ExitSubgraphLine,
20
+ MemoryPlanningLine,
21
+ MemoryPlanningState,
22
+ PythonWrapperCodegen,
23
+ )
24
+
25
+
26
+ BufferName = str
27
+
28
+ # Default thread stack sizes vary by platform:
29
+ # - Linux: 8 MB
30
+ # - macOS: 512 KB
31
+ # - Windows: 1 MB
32
+ # Just pick something comfortably smaller than the smallest for now.
33
+ MAX_STACK_ALLOCATION_SIZE = 1024 * 100
34
+
35
+
36
+ class CppWrapperCpuArrayRef(CppWrapperCpu):
37
+ """
38
+ Generates cpp wrapper for running on CPU and calls cpp kernels
39
+
40
+ This class is forked from CppWrapperCpu, with a difference that tensors may be
41
+ represented as ArrayRef, see torch/csrc/inductor/aoti_runtime/arrayref_tensor.h
42
+ """
43
+
44
    def __init__(self):
        super().__init__()
        # ArrayRef tensors are stack/thread-local storage, which only makes
        # sense on CPU (self.device is set by the base-class constructor).
        assert self.device == "cpu", "ArrayRefTensor only supported on CPU!"
        # May be flipped to False later (e.g. by extern-kernel codegen).
        self.allow_stack_allocation = config.aot_inductor.allow_stack_allocation
        # Buffers that were promoted to stack allocations, keyed by name.
        self.stack_allocated_buffers: dict[BufferName, BufferLike] = {}
49
+
50
    @staticmethod
    def create(
        is_subgraph: bool,
        subgraph_name: Optional[str],
        parent_wrapper: Optional[PythonWrapperCodegen],
        partition_signatures: Optional[ir.GraphPartitionSignature] = None,
    ):
        """Factory matching the wrapper-codegen creation interface.

        The subgraph-related arguments are currently ignored: subgraph
        codegen is not yet supported for the array-ref wrapper.
        """
        # TODO - support subgraph codegen by lifting functions. Check the
        # comment at CppWrapperCpu `codegen_subgraph` function.
        return CppWrapperCpuArrayRef()
60
+
61
+ @staticmethod
62
+ def get_input_cpp_type(input):
63
+ assert config.aot_inductor.use_minimal_arrayref_interface
64
+
65
+ if isinstance(input, sympy.Expr):
66
+ from ..graph import may_get_constant_buffer_dtype
67
+
68
+ dtype = may_get_constant_buffer_dtype(input)
69
+ assert dtype is not None, f"Failed to get the dtype of sympy.Expr: {input}"
70
+ return DTYPE_TO_CPP[dtype]
71
+ return f"ArrayRefTensor<{DTYPE_TO_CPP[input.get_dtype()]}>"
72
+
73
+ @staticmethod
74
+ def get_device_include_path(device: str) -> str:
75
+ assert device == "cpu", "ArrayRef only supported on CPU!"
76
+ if V.graph.aot_mode:
77
+ return "#include <torch/csrc/inductor/aoti_include/array_ref.h>"
78
+ return "#include <torch/csrc/inductor/cpp_wrapper/array_ref.h>"
79
+
80
+ def codegen_input_numel_asserts(self):
81
+ for name, buf in V.graph.graph_inputs.items():
82
+ if isinstance(buf, sympy.Expr):
83
+ continue
84
+
85
+ # comparing strides for 0 size tensor is tricky. Ignore them for now.
86
+ if sympy_product(buf.get_size()) == 0:
87
+ continue
88
+ numel = buf.get_numel()
89
+ self.prefix.writeline(f"assert_numel({name}, {numel});")
90
+
91
    def generate_extern_kernel_alloc(self, *args, **kwargs):
        # Disable stack allocation for extern kernels: they require real
        # AtenTensorHandles rather than stack-backed ArrayRef tensors.
        self.allow_stack_allocation = False
        super().generate_extern_kernel_alloc(*args, **kwargs)
95
+
96
    def generate_extern_kernel_out(self, *args, **kwargs):
        # Disable stack allocation for extern kernels (see
        # generate_extern_kernel_alloc), then defer to the base class.
        self.allow_stack_allocation = False
        super().generate_extern_kernel_out(*args, **kwargs)
100
+
101
    def generate_fallback_kernel(self, node: ir.FallbackKernel) -> None:
        # Disable stack allocation for extern/fallback kernels, then defer
        # to the base class implementation.
        self.allow_stack_allocation = False
        super().generate_fallback_kernel(node)
105
+
106
    def _generate_kernel_call_helper(
        self,
        kernel_name: str,
        call_args,
        *,
        device=None,
        triton=True,
        arg_types=None,
        raw_keys=None,
        raw_args=None,
        triton_meta=None,
        graph_name="",
        original_fxnode_name=None,
    ):
        """
        Generates kernel call code.

        triton: Defines whether the GPU backend uses Triton for codegen.
                Otherwise it uses the CUDA language for codegen.
                Only valid when cuda == True.

        This CPU/ArrayRef variant rejects Triton entirely; pointer-typed
        arguments are unwrapped via get_data_ptr_wrapper and cast to the
        declared C type before the call is emitted.
        """
        assert not triton, (
            "CppWrapperCpuArrayRef.generate_kernel_call does not support GPU"
        )
        assert arg_types is not None and len(call_args) == len(arg_types), (
            "Mismatch call_args and arg_types in generate_kernel_call"
        )
        new_args = []
        for idx, arg in enumerate(call_args):
            if "*" in arg_types[idx]:
                # Pointer argument: materialize a typed raw pointer first so
                # both RAII handles and ArrayRef tensors can be passed.
                var_name = f"var_{next(self.arg_var_id)}"
                self.writeline(f"auto* {var_name} = get_data_ptr_wrapper({arg});")
                new_args.append(f"({arg_types[idx]})({var_name})")
            else:
                # arg is a scalar
                new_args.append(arg)
        # debug printer related logic for cpp kernel type.
        debug_printer_manager = V.graph.wrapper_code.debug_printer
        debug_printer_manager.set_printer_args(
            call_args,
            kernel_name,
            None,
            None,
            "cpp",
        )
        with debug_printer_manager:
            self.writeline(self.wrap_kernel_call(kernel_name, new_args))
153
+
154
    def write_wrapper_decl(self):
        """Emit the C++ entry-point declaration(s) and the input/constant
        unpacking prologue for either AOT mode (run_impl / minimal arrayref
        interface / const-graph) or JIT cpp-wrapper mode."""
        inputs_len = len(V.graph.graph_inputs.keys())
        if V.graph.aot_mode:
            if (
                config.aot_inductor.use_minimal_arrayref_interface
                and not V.graph.is_const_graph
            ):
                # Tuple type aliases describing the typed input/output
                # signature of the minimal arrayref interface.
                input_cpp_types = ", ".join(
                    f"{CppWrapperCpuArrayRef.get_input_cpp_type(x)}"
                    for x in V.graph.graph_inputs.values()
                )
                output_arrayref_types = ", ".join(
                    f"ArrayRefTensor<{DTYPE_TO_CPP[x.get_dtype()]}>"
                    for x in V.graph.graph_outputs
                )

                self.prefix.splice(
                    f"""
                    using AOTInductorModelInputs = std::tuple<{input_cpp_types}>;
                    using AOTInductorModelOutputs = std::tuple<{output_arrayref_types}>;
                    """
                )

            if V.graph.const_module:
                # Pull in the separately-compiled constant graph's code.
                self.header.splice(V.graph.const_module.wrapper_code.header)

                assert V.graph.const_wrapper_code is not None
                self.prefix.splice(V.graph.const_wrapper_code)

                assert V.graph.const_kernel_code is not None
                self.kernel_declarations.splice(V.graph.const_kernel_code)

            if V.graph.is_const_graph:
                self.prefix.splice(
                    """
                    void AOTInductorModel::_const_run_impl(
                        std::vector<AtenTensorHandle>& output_handles,
                        DeviceStreamType stream,
                        AOTIProxyExecutorHandle proxy_executor
                    ) {
                    """
                )
            else:
                if not config.aot_inductor.use_runtime_constant_folding:
                    # If we do not split the constant graph, we'll just create
                    # an empty implementation when wrapping the main module.
                    self.prefix.splice(
                        """
                        void AOTInductorModel::_const_run_impl(
                            std::vector<AtenTensorHandle>& output_handles,
                            DeviceStreamType stream,
                            AOTIProxyExecutorHandle proxy_executor
                        ) {}

                        """
                    )

                run_impl_proto = """
                    void AOTInductorModel::run_impl(
                        AtenTensorHandle*
                            input_handles, // array of input AtenTensorHandle; handles
                                           // are stolen; the array itself is borrowed
                        AtenTensorHandle*
                            output_handles, // array for writing output AtenTensorHandle; handles
                                            // will be stolen by the caller; the array itself is
                                            // borrowed
                        DeviceStreamType stream,
                        AOTIProxyExecutorHandle proxy_executor
                    ) {
                    """

                self.generate_input_output_runtime_checks()
                run_impl_proto += """
                    __check_inputs_outputs(input_handles, output_handles);
                    """

                if config.aot_inductor.use_minimal_arrayref_interface:
                    # The real body goes into the templated minimal-arrayref
                    # entry; the handle-based run_impl becomes a thin
                    # conversion shim appended to the suffix.
                    self.prefix.splice(
                        """
                        template <>
                        AOTInductorModelOutputs AOTInductorModel::run_impl_minimal_arrayref_interface<
                          AOTInductorModelInputs, AOTInductorModelOutputs>(
                            const AOTInductorModelInputs& inputs,
                            DeviceStreamType stream,
                            AOTIProxyExecutorHandle proxy_executor
                        ) {
                        """
                    )
                    self.suffix.splice(run_impl_proto)
                    self.suffix.splice(
                        """
                        AOTInductorModelInputs inputs;
                        convert_handles_to_inputs(input_handles, inputs);
                        auto outputs = run_impl_minimal_arrayref_interface<AOTInductorModelInputs, AOTInductorModelOutputs>(
                            inputs, stream, proxy_executor);
                        // NOTE: outputs is full of ArrayRef to thread_local storage. If in the future we need this
                        // interface to perform well for a DSO using the minimal arrayref interface, all we need
                        // to do is provide ThreadLocalCachedTensor for each one!
                        convert_outputs_to_handles(outputs, output_handles);
                        }
                        """
                    )

                    self.suffix.splice(
                        """
                        extern "C" AOTIRuntimeError AOTInductorModelRunMinimalArrayrefInterface(
                            AOTInductorModelHandle model_handle,
                            const AOTInductorModelInputs& inputs,
                            AOTInductorModelOutputs& outputs) {
                          auto model = reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
                          CONVERT_EXCEPTION_TO_ERROR_CODE({
                              outputs = model->run_impl_minimal_arrayref_interface<AOTInductorModelInputs, AOTInductorModelOutputs>(
                                  inputs,
                                  (torch::aot_inductor::DeviceStreamType)nullptr,
                                  nullptr);
                          })
                        }
                        """
                    )
                else:
                    self.prefix.splice(run_impl_proto)
        else:
            # cpp entry function for JIT with cpp wrapper
            self.prefix.splice(
                """
                void inductor_entry_impl(
                    AtenTensorHandle*
                        input_handles, // array of input AtenTensorHandle; handles
                                       // are stolen; the array itself is borrowed
                    AtenTensorHandle*
                        output_handles // array for writing output AtenTensorHandle; handles
                                       // will be stolen by the caller; the array itself is
                                       // borrowed)
                ) {
                """
            )
        with self.prefix.indent():
            # assign inputs and outputs in both cases so the later codegen can be simplified
            if not config.aot_inductor.use_minimal_arrayref_interface:
                if not V.graph.is_const_graph:
                    if V.graph.aot_mode:
                        num_args = len(V.graph.graph_inputs)
                    else:
                        # Weights are promoted in the JIT mode
                        num_args = len(V.graph.graph_inputs) + len(V.graph.constants)
                        # release GIL to support multiple instances inference (in different threads of the same process)
                        self.prefix.splice("py::gil_scoped_release release;")

                    self.prefix.splice(
                        f"""
                        auto inputs = steal_from_raw_handles_to_raii_handles(input_handles, {num_args});
                        """
                    )

            if inputs_len != 0:
                for idx, input_key in enumerate(V.graph.graph_inputs.keys()):
                    if config.aot_inductor.use_minimal_arrayref_interface:
                        # Inputs arrive as a typed std::tuple here.
                        self.prefix.writeline(
                            f"auto {input_key} = std::get<{idx}>(inputs);"
                        )
                        continue
                    # unwrap input tensor back to scalar
                    if isinstance(V.graph.graph_inputs[input_key], sympy.Expr):
                        from ..graph import may_get_constant_buffer_dtype

                        dtype = may_get_constant_buffer_dtype(
                            V.graph.graph_inputs[input_key]  # type: ignore[arg-type]
                        )
                        assert dtype is not None, (
                            "Fails to get the dtype of the sympy.Expr"
                        )
                        self.codegen_tensor_item(
                            dtype, f"inputs[{idx}]", input_key, self.prefix
                        )
                    else:
                        self.prefix.writeline(
                            f"auto {input_key} = std::move(inputs[{idx}]);"
                        )

            assert all(
                isinstance(v, torch.Tensor) for v in list(V.graph.constants.values())
            ), "Expect all constants to be Tensor"
            for idx, constants_key in enumerate(V.graph.constants.keys()):
                if V.graph.aot_mode:
                    # Weights are stored in constants_ and owned by RAIIAtenTensorHandle there.
                    # Don't call std::move here because it will cause constants_ to lose the ownership.
                    self.prefix.writeline(
                        f"""auto {constants_key} = constants_->at({idx});"""
                    )
                else:
                    # Append constants as inputs to the graph
                    constants_idx = inputs_len + idx
                    self.prefix.writeline(
                        f"auto {constants_key} = std::move(inputs[{constants_idx}]);"
                    )

            self.codegen_inputs()

            if V.graph.aot_mode:
                if not V.graph.is_const_graph:
                    if config.aot_inductor.use_minimal_arrayref_interface:
                        # TODO: input shape checking for regular tensor interface as well?
                        self.codegen_input_numel_asserts()
                    else:
                        self.prefix.writeline("inputs.clear();")
                    self.prefix.writeline(
                        "[[maybe_unused]] auto& kernels = static_cast<AOTInductorModelKernels&>(*this->kernels_.get());"
                    )
363
    def generate_return(self, output_refs: list[str]):
        """Emit the code that hands each graph output back to the caller,
        either as entries of the minimal-arrayref output tuple or by filling
        ``output_handles``; constants are cloned rather than released."""
        cst_names = V.graph.constants.keys()
        arr_iface = (
            not V.graph.is_const_graph
            and config.aot_inductor.use_minimal_arrayref_interface
        )  # For brevity.

        def use_thread_local_cached_output_tensor(idx, output):
            # Fallback path for outputs that are not tensor handles: copy
            # them into a thread_local cache object and expose that.
            cached_output_name = f"cached_output_{next(self.cached_output_id)}"
            cache_type = "Array" if arr_iface else "Tensor"
            self.wrapper_call.writeline(
                f"thread_local ThreadLocalCachedOutput{cache_type}<std::decay_t<decltype({output})>> "
                f"{cached_output_name}({output});"
            )
            if arr_iface:
                self.wrapper_call.writeline(
                    f"{cached_output_name}.copy_data_from({output});"
                )
                output_entry = f"std::get<{idx}>(output_arrayref_tensors)"
                element_type = f"std::decay_t<decltype({output_entry}.data()[0])>"
                self.wrapper_call.writeline(
                    f"{output_entry} = {cached_output_name}.arrayref_tensor<{element_type}>();"
                )
            else:
                self.wrapper_call.writeline(
                    f"{cached_output_name}.copy_data_from({output});"
                )
                self.wrapper_call.writeline(
                    f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_new_uninitialized_tensor(&output_handles[{idx}]));"
                )
                self.wrapper_call.writeline(
                    f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_assign_tensors({cached_output_name}.tensor(), "
                    f"output_handles[{idx}]));"
                )

        if arr_iface:
            self.wrapper_call.writeline(
                "AOTInductorModelOutputs output_arrayref_tensors;"
            )

        # Tracks the first output slot each distinct ref was released into,
        # so duplicated outputs alias instead of double-releasing.
        output2idx: dict[str, int] = {}
        for idx, output in enumerate(output_refs):
            if output == "nullptr":
                continue

            is_constant_buffer = output in cst_names
            output_buffer = V.graph.graph_outputs[idx]
            if isinstance(output_buffer, ir.BaseView):
                # A view over a constant is still a constant for ownership
                # purposes.
                output_storage = output_buffer.unwrap_view()
                if isinstance(output_storage.data, ir.ConstantBuffer):
                    is_constant_buffer = True

            if isinstance(output_buffer, ir.ShapeAsConstantBuffer):
                # Need to wrap scalar into tensor as the main function returns a vector of tensors
                output_tensor = self.codegen_scalar_to_tensor(output)
                self.wrapper_call.writeline(
                    f"output_handles[{idx}] = {output_tensor}.release();"
                )
                continue

            # Compile-time check: does this output's C++ type hold an ATen
            # tensor handle (vs. an ArrayRef-style value)?
            output_is_tensor_handle_expr = (
                f"std::is_same_v<std::decay_t<decltype({output})>,"
                "RAIIAtenTensorHandle> || "
                f"std::is_same_v<std::decay_t<decltype({output})>,"
                "AtenTensorHandle> || "
                f"std::is_same_v<std::decay_t<decltype({output})>,"
                "ConstantHandle>"
            )
            self.wrapper_call.writeline(
                f"if constexpr ({output_is_tensor_handle_expr}) {{"
            )
            with self.wrapper_call.indent():
                if arr_iface:
                    cached_output_name = f"cached_output_{next(self.cached_output_id)}"
                    self.wrapper_call.writeline(
                        f"thread_local RAIIAtenTensorHandle {cached_output_name};"
                    )
                    if is_constant_buffer:
                        # NOTE(return_constant): In some rare cases where we return
                        # a constant, we have to return a copy of this constant,
                        # because (1) constants are not owned by the Model instance
                        # (2) constants remain the same cross inference runs,
                        # assuming they are not updated at runtime Basically, we
                        # cannot release or transfer the ownership of any original
                        # constant to the user.
                        self.wrapper_call.writeline(
                            f"AtenTensorHandle {cached_output_name}_tmp;"
                        )
                        self.wrapper_call.writeline(
                            f"aoti_torch_clone({output}, &{cached_output_name}_tmp);"
                        )
                        self.wrapper_call.writeline(
                            f"{cached_output_name} = {cached_output_name}_tmp;"
                        )
                    else:
                        self.wrapper_call.writeline(
                            f"{cached_output_name} = {output}.release();"
                        )
                    self.wrapper_call.writeline(
                        f"convert_handle_to_arrayref_tensor({cached_output_name}, "
                        f"std::get<{idx}>(output_arrayref_tensors));"
                    )
                else:
                    if is_constant_buffer:
                        # See NOTE(return_constant) above.
                        self.wrapper_call.writeline(
                            f"aoti_torch_clone({output}, &output_handles[{idx}]);"
                        )
                    else:
                        if output in output2idx:
                            # Duplicate output: alias the already-released
                            # handle instead of releasing twice.
                            src_idx = output2idx[output]
                            self.wrapper_call.writeline(
                                f"output_handles[{idx}] = output_handles[{src_idx}];"
                            )
                        else:
                            self.wrapper_call.writeline(
                                f"output_handles[{idx}] = {output}.release();"
                            )
            self.wrapper_call.writeline("} else {")
            with self.wrapper_call.indent():
                use_thread_local_cached_output_tensor(idx, output)
            self.wrapper_call.writeline("}")

            if output not in output2idx:
                output2idx[output] = idx
        if arr_iface:
            self.wrapper_call.writeline("return output_arrayref_tensors;")
491
+ def memory_plan(self):
492
+ from .memory_planning import MemoryPlanner
493
+
494
+ self.lines = MemoryPlanner(self).plan(self.lines)
495
+ # TODO: integrate memory planning & stack allocation?
496
+ self.allow_stack_allocation = False
497
+
498
+ def memory_plan_reuse(self):
499
+ out_names = V.graph.get_output_names()
500
+
501
+ while (
502
+ self.lines
503
+ and isinstance(self.lines[-1], MemoryPlanningLine)
504
+ # TODO: this seems legit, NullLine has no node
505
+ and self.lines[-1].node.name not in out_names # type: ignore[attr-defined]
506
+ ):
507
+ # these lines will be pointless
508
+ self.lines.pop()
509
+
510
+ # codegen allocations in two passes
511
+ planning_states = [MemoryPlanningState()]
512
+ past_planning_states = []
513
+ for i in range(len(self.lines)):
514
+ line = self.lines[i]
515
+ if isinstance(line, MemoryPlanningLine):
516
+ self.lines[i] = line.plan(planning_states[-1])
517
+ elif isinstance(line, EnterSubgraphLine):
518
+ planning_states.append(MemoryPlanningState())
519
+ elif isinstance(line, ExitSubgraphLine):
520
+ past_planning_states.append(planning_states.pop())
521
+ past_planning_states.append(planning_states.pop())
522
+ assert len(planning_states) == 0
523
+
524
+ # conservatively use the sum of all allocated buffer sizes
525
+ # in potentially nested scopes as the total allocated size
526
+ total_allocated_buffer_size = sum(
527
+ s.total_allocated_buffer_size for s in past_planning_states
528
+ )
529
+
530
+ self.allow_stack_allocation = (
531
+ self.allow_stack_allocation is not False
532
+ and config.aot_inductor.allow_stack_allocation
533
+ and total_allocated_buffer_size <= MAX_STACK_ALLOCATION_SIZE
534
+ )
535
+
536
+ def can_stack_allocate_buffer(self, buffer):
537
+ return (
538
+ self.allow_stack_allocation
539
+ and buffer.get_device().type == "cpu"
540
+ and self.can_prove_buffer_has_static_shape(buffer)
541
+ and ir.is_contiguous_strides_for_shape(
542
+ buffer.get_stride(), buffer.get_size()
543
+ )
544
+ )
545
+
546
+ def make_buffer_free(self, buffer):
547
+ return (
548
+ ""
549
+ if isinstance(buffer.get_output_spec(), ir.MultiOutputLayout)
550
+ or (V.graph.aot_mode and buffer.get_name() in self.stack_allocated_buffers)
551
+ or (
552
+ config.aot_inductor.use_minimal_arrayref_interface
553
+ and V.graph.aot_mode
554
+ and buffer.get_name() in V.graph.graph_inputs
555
+ )
556
+ else f"{buffer.get_name()}.reset();"
557
+ )
558
+
559
+ def make_buffer_allocation(self, buffer):
560
+ return self.make_allocation(
561
+ buffer.get_name(),
562
+ buffer.get_device(),
563
+ buffer.get_dtype(),
564
+ buffer.get_size(),
565
+ buffer.get_stride(),
566
+ buffer if self.can_stack_allocate_buffer(buffer) else None,
567
+ )
568
+
569
+ def make_allocation(
570
+ self, name, device, dtype, shape, stride, buffer_if_can_stack_allocate=None
571
+ ):
572
+ orig_stride = stride
573
+ device_str = self.codegen_device(device)
574
+ dtype_code = self.codegen_dtype(dtype)
575
+ size = self.codegen_shape_tuple(shape)
576
+ stride = self.codegen_shape_tuple(orig_stride)
577
+ size_array_var = self.codegen_int_array_var(
578
+ size,
579
+ self.wrapper_call.writeline,
580
+ known_statically=self.is_statically_known_list_of_ints(shape),
581
+ graph=self.get_codegened_graph(),
582
+ )
583
+ stride_array_var = self.codegen_int_array_var(
584
+ stride,
585
+ self.wrapper_call.writeline,
586
+ known_statically=self.is_statically_known_list_of_ints(orig_stride),
587
+ graph=self.get_codegened_graph(),
588
+ )
589
+ device_type, device_id = device_str.split(",")
590
+ device_idx = "this->device_idx_" if V.graph.aot_mode else device_id
591
+ if buffer_if_can_stack_allocate is not None:
592
+ self.stack_allocated_buffers[name] = buffer_if_can_stack_allocate
593
+ cpp_type = DTYPE_TO_CPP[dtype]
594
+ numel = buffer_if_can_stack_allocate.get_numel()
595
+ # Note: we don't zero storage because empty_strided doesn't zero either.
596
+ self.wrapper_call.writeline(f"{cpp_type} {name}_storage[{numel}];")
597
+ args = [
598
+ f"{name}_storage",
599
+ size_array_var,
600
+ stride_array_var,
601
+ device_type,
602
+ device_idx,
603
+ ]
604
+ return f"ArrayRefTensor<{cpp_type}> {name}({', '.join(args)});"
605
+
606
+ args = [
607
+ str(len(shape)),
608
+ size_array_var,
609
+ stride_array_var,
610
+ dtype_code,
611
+ device_type,
612
+ device_idx,
613
+ f"&{name}_handle",
614
+ ]
615
+
616
+ self.wrapper_call.writeline(f"AtenTensorHandle {name}_handle;")
617
+ self.wrapper_call.writeline(
618
+ f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided({', '.join(args)}));"
619
+ )
620
+
621
+ return f"RAIIAtenTensorHandle {name}({name}_handle);"
622
+
623
+ def make_buffer_reuse(self, old: BufferLike, new: BufferLike, delete_old: bool):
624
+ assert old.get_dtype() == new.get_dtype()
625
+ old_name = old.get_name()
626
+ new_name = new.get_name()
627
+ del_line = ";"
628
+ if old_name not in V.graph.get_output_names() and delete_old:
629
+ del_line = f"; {self.make_buffer_free(old)}"
630
+
631
+ if old.get_size() == new.get_size() and old.get_stride() == new.get_stride():
632
+ if old_name in self.stack_allocated_buffers:
633
+ self.stack_allocated_buffers[new_name] = new
634
+ return self.codegen_exact_buffer_reuse(old_name, new_name, del_line)
635
+
636
+ reinterpret_view = self.codegen_reinterpret_view(
637
+ old, new.get_size(), new.get_stride(), 0, self.wrapper_call.writeline
638
+ )
639
+ if reinterpret_view in self.stack_allocated_buffers:
640
+ self.stack_allocated_buffers[new_name] = new
641
+ # The only way to get into this case is via an exact buffer reuse, since all
642
+ # other options result in a new tensor handle.
643
+ return self.codegen_exact_buffer_reuse(old_name, new_name, del_line)
644
+ return f"{self.declare}{new_name} = {reinterpret_view}{del_line} // reuse"
645
+
646
+ def _assert_safe_to_use_borrow_arrayref_tensor_as_tensor(self):
647
+ # Borrowing arguments to shim functions is only safe because we know
648
+ # that the arguments can't be stack-allocated. Otherwise, to be sure
649
+ # we can't return a dangling pointer, we need to either 1) be
650
+ # certain that the shim function cannot return an alias of a
651
+ # borrowed argument, or 2) be certain that the returned Tensor from
652
+ # the shim function cannot escape.
653
+ assert self.is_safe_to_use_borrow_arrayref_tensor_as_tensor(), (
654
+ "borrowing arguments to shim functions is unsafe with "
655
+ "stack allocation on! (see comment above this assertion)"
656
+ )
657
+
658
+ def is_safe_to_use_borrow_arrayref_tensor_as_tensor(self):
659
+ return not self.allow_stack_allocation and not self.stack_allocated_buffers
660
+
661
+ def generate_c_shim_extern_kernel_call(
662
+ self, kernel: str, args: list[str], device: str, **_
663
+ ) -> None:
664
+ # In the abi_compatible mode, we call fallback aten ops through a C shim layer
665
+ # Setting self.allow_stack_allocation to False because the exchange between
666
+ # ArrayRefTensor and at::Tensor is still fragile.
667
+ self.allow_stack_allocation = False
668
+
669
+ wrapped_args = []
670
+ for arg in args:
671
+ # We only really *need* borrow_arrayref_tensor_as_tensor for
672
+ # ArrayRefTensors. The code flowing into here uses `0` for nullptr, which
673
+ # borrow_arrayref_tensor_as_tensor would blindly coerce to int, so just
674
+ # avoid wrapping integers. Name matching is to find tensor is hacky, but
675
+ # fixing all the ArrayRefTensor issues is not a priority for now.
676
+ if isinstance(arg, str) and arg.startswith(
677
+ ("buf", "arg", "wrap_with_raii_handle_if_needed")
678
+ ):
679
+ self._assert_safe_to_use_borrow_arrayref_tensor_as_tensor()
680
+ arg = f"borrow_arrayref_tensor_as_tensor({arg})"
681
+ wrapped_args.append(arg)
682
+
683
+ super().generate_c_shim_extern_kernel_call(
684
+ kernel, wrapped_args, device, debug_args=args
685
+ )
686
+
687
+ def generate_scatter_fallback(
688
+ self,
689
+ output,
690
+ inputs,
691
+ cpp_kernel_name,
692
+ python_kernel_name,
693
+ src_is_tensor,
694
+ reduce,
695
+ kwargs,
696
+ ):
697
+ # No stack allocation when there is a fallback op
698
+ self.allow_stack_allocation = False
699
+
700
+ # call the ABI shim function instead of the ATen one
701
+ cpp_kernel_name = self.get_c_shim_func_name(cpp_kernel_name, self.device)
702
+ # TODO: consider remove "_out" and add missing inplace variants to fallback_ops.py
703
+ cpp_kernel_name = cpp_kernel_name.replace("__", "_") + "_out"
704
+ self._assert_safe_to_use_borrow_arrayref_tensor_as_tensor()
705
+ inputs_wrapped = [
706
+ (f"borrow_arrayref_tensor_as_tensor({x})" if isinstance(x, str) else str(x))
707
+ for x in inputs
708
+ ]
709
+ line = f"{cpp_kernel_name}(borrow_arrayref_tensor_as_tensor({output}), {','.join(inputs_wrapped)}"
710
+
711
+ if python_kernel_name.startswith("aten.scatter_reduce"):
712
+ line += f", {','.join(kwargs)}"
713
+ else:
714
+ if src_is_tensor:
715
+ if reduce:
716
+ line += f", {V.graph.wrapper_code.val_to_arg_str(reduce)}"
717
+ else:
718
+ assert reduce is None, (
719
+ "Expect reduce to be None for aten.scatter_ with scalar src"
720
+ )
721
+ line += ");"
722
+ self.writeline(line)
723
+
724
+ def generate_index_put_fallback(self, kernel, x, indices, values, accumulate):
725
+ # No stack allocation when there is a fallback op
726
+ self.allow_stack_allocation = False
727
+
728
+ self._assert_safe_to_use_borrow_arrayref_tensor_as_tensor()
729
+ # TODO: update aoti_torch_index_put_out in ir.py to use autogen out version
730
+ # See the comment in codegen_reinterpret_view about why having something like
731
+ # RAIIAtenTensorHandle(tmp_tensor_handle_2) in a tmp array can cause the corresponding
732
+ # tensor prematurely deallocated, thus the temporary array trick here.
733
+ indices_str = self._generate_temporary_array_pointer(
734
+ "AtenTensorHandle",
735
+ [f"borrow_arrayref_tensor_as_tensor({i})" for i in indices],
736
+ )
737
+ args = [
738
+ f"borrow_arrayref_tensor_as_tensor({x})",
739
+ indices_str,
740
+ str(len(indices)),
741
+ f"borrow_arrayref_tensor_as_tensor({values})",
742
+ accumulate,
743
+ ]
744
+ args.insert(
745
+ 0, f"borrow_arrayref_tensor_as_tensor({x})"
746
+ ) # set x as the output tensor, this fallback mutates x.
747
+ self.writeline(self.wrap_kernel_call(kernel, args))
748
+
749
+ def generate_fallback_kernel_with_runtime_lookup(
750
+ self,
751
+ buf_name: str,
752
+ python_kernel_name: str,
753
+ get_args: Callable[[], Sequence[str]],
754
+ op_overload: Union[torch._ops.OpOverload, torch._ops.HigherOrderOperator],
755
+ raw_args: Sequence[Any],
756
+ outputs: Sequence[ir.Buffer],
757
+ ) -> None:
758
+ # No stack allocation when there is a fallback op
759
+ self.allow_stack_allocation = False
760
+ super().generate_fallback_kernel_with_runtime_lookup(
761
+ buf_name, python_kernel_name, get_args, op_overload, raw_args, outputs
762
+ )
763
+
764
+ def codegen_device_copy(self, src, dst, non_blocking: bool):
765
+ # aoti_torch_tensor_copy_ takes AtenTensorHandle as input,
766
+ # while stack-allocation results in ArrayRefTensor
767
+ # so disable stack allocation here
768
+ self.allow_stack_allocation = False
769
+ self.writeline(
770
+ f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_copy_(expensive_copy_to_tensor_if_needed({dst}), {src}, {non_blocking}));"
771
+ )
772
+
773
+ def codegen_reinterpret_view(
774
+ self,
775
+ data,
776
+ size,
777
+ stride,
778
+ offset,
779
+ writeline: Callable[..., None],
780
+ dtype=None,
781
+ ) -> str:
782
+ """Returns a newly-created, temporary RAII tensor handle containing the
783
+ reinterpreted tensor data. Callers of this function are responsible for saving
784
+ the handle if persistent access is needed."""
785
+ dim = str(len(size))
786
+
787
+ def create_reinterpret_call() -> str:
788
+ args = [
789
+ f"{data.get_name()}",
790
+ dim,
791
+ self.codegen_int_array_var(
792
+ self.codegen_shape_tuple(size),
793
+ writeline,
794
+ known_statically=self.is_statically_known_list_of_ints(size),
795
+ graph=self.get_codegened_graph(),
796
+ ),
797
+ self.codegen_int_array_var(
798
+ self.codegen_shape_tuple(stride),
799
+ writeline,
800
+ known_statically=self.is_statically_known_list_of_ints(stride),
801
+ graph=self.get_codegened_graph(),
802
+ ),
803
+ offset,
804
+ ]
805
+ return f"wrap_with_raii_handle_if_needed(reinterpret_tensor_wrapper({', '.join(args)}))"
806
+
807
+ def create_new_tensor_handle() -> tuple[str, list[str]]:
808
+ # Calling reset() on ArrayRefTensor does nothing, since the array is
809
+ # const-allocated on the stack. Thus, it's safe to return a reference to
810
+ # the original array.
811
+ if (name := data.get_name()) in self.stack_allocated_buffers:
812
+ return name, []
813
+
814
+ tmp_AtenTensorHandle = f"tmp_{name}_{next(self.tmp_tensor_id)}"
815
+ tmp_call_strs = [
816
+ f"AtenTensorHandle {tmp_AtenTensorHandle};",
817
+ f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_new_tensor_handle({data.get_name()}, &{tmp_AtenTensorHandle}));",
818
+ ]
819
+ return f"RAIIAtenTensorHandle({tmp_AtenTensorHandle})", tmp_call_strs
820
+
821
+ if (
822
+ size == data.layout.size
823
+ and stride == data.layout.stride
824
+ and offset == data.layout.offset
825
+ and (dtype is None or dtype == data.dtype)
826
+ ):
827
+ final_tensor_str, call_strs = create_new_tensor_handle()
828
+ for line in call_strs:
829
+ writeline(line)
830
+ return final_tensor_str
831
+
832
+ return super().codegen_reinterpret_view(
833
+ data, size, stride, offset, writeline, dtype
834
+ )
835
+
836
+ def val_to_arg_str(self, val, type_=None) -> str:
837
+ if (
838
+ val is not None
839
+ and isinstance(type_, torch.OptionalType)
840
+ and isinstance(type_.getElementType(), torch.TensorType)
841
+ ):
842
+ # Handle optional tensors as a special case, as in the parent class.
843
+ base_handle = self.val_to_arg_str(val, torch.TensorType)
844
+ if config.aot_inductor.use_minimal_arrayref_interface:
845
+ if self.is_safe_to_use_borrow_arrayref_tensor_as_tensor():
846
+ base_handle = f"borrow_arrayref_tensor_as_tensor({base_handle})"
847
+ else:
848
+ base_handle = f"copy_arrayref_tensor_to_tensor({base_handle})"
849
+ return f"&temporary_reference({base_handle}.get())"
850
+
851
+ return super().val_to_arg_str(val, type_)
852
+
853
+ def codegen_tensor_item(
854
+ self, dtype: torch.dtype, tensor: str, scalar: str, indented_buffer=None
855
+ ):
856
+ dtype_str = str(dtype).split(".")[-1]
857
+ writer = indented_buffer or self
858
+
859
+ if dtype == torch.float16 or dtype == torch.bfloat16:
860
+ scalar_tmp = f"{scalar}_tmp"
861
+ writer.writeline(f"{DTYPE_TO_CPP[dtype]} {scalar_tmp};")
862
+
863
+ # We know that item_ doesn't alias the input, so borrowing should be safe.
864
+ tensor = f"borrow_arrayref_tensor_as_tensor({tensor})"
865
+
866
+ writer.writeline(
867
+ f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_item_{dtype_str}({tensor}, &{scalar_tmp}));"
868
+ )
869
+ writer.writeline(f"float {scalar} = float({scalar_tmp});")
870
+ else:
871
+ writer.writeline(f"{DTYPE_TO_CPP[dtype]} {scalar};")
872
+
873
+ # We know that item_ doesn't alias the input, so borrowing should be safe.
874
+ tensor = f"borrow_arrayref_tensor_as_tensor({tensor})"
875
+
876
+ writer.writeline(
877
+ f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_item_{dtype_str}({tensor}, &{scalar}));"
878
+ )
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_wrapper_gpu.py ADDED
@@ -0,0 +1,717 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ from __future__ import annotations
3
+
4
+ import dataclasses
5
+ import re
6
+ from itertools import count, zip_longest
7
+ from typing import Any, Optional, Union
8
+ from typing_extensions import Self
9
+
10
+ import sympy
11
+
12
+ import torch
13
+ from torch import dtype as torch_dtype
14
+ from torch._inductor.codecache import get_cpp_wrapper_cubin_path_name
15
+ from torch._inductor.runtime.runtime_utils import dynamo_timed
16
+
17
+ from .. import config
18
+ from ..codecache import CudaKernelParamCache
19
+ from ..ir import (
20
+ GraphPartitionSignature,
21
+ TensorBox,
22
+ TMADescriptorExperimental,
23
+ TMADescriptorStable,
24
+ )
25
+ from ..utils import cache_on_self, get_gpu_type, GPU_ALIGN_BYTES, IndentedBuffer
26
+ from ..virtualized import V
27
+ from .aoti_hipify_utils import maybe_hipify_code_wrapper
28
+ from .common import get_device_op_overrides, TritonScratchWorkspace
29
+ from .cpp_utils import cexpr
30
+ from .cpp_wrapper_cpu import CppWrapperCpu
31
+ from .multi_kernel import MultiKernelCall
32
+ from .triton_utils import should_unwrap_unspec_arg
33
+ from .wrapper import PythonWrapperCodegen, SymbolicCallArg
34
+
35
+
36
+ _cpp_string_literal_escapes = {
37
+ "\\": "\\\\",
38
+ '"': '\\"',
39
+ "\n": "\\n",
40
+ "\t": "\\t",
41
+ "\r": "\\r",
42
+ }
43
+ _cpp_string_literal_pattern = re.compile(r'["\\\n\t\r]')
44
+
45
+
46
+ def cpp_string_literal(s: str) -> str:
47
+ escaped = _cpp_string_literal_pattern.sub(
48
+ lambda match: _cpp_string_literal_escapes[match.group(0)], s
49
+ )
50
+ return f'"{escaped}"'
51
+
52
+
53
+ @dataclasses.dataclass
54
+ class DeferredTritonCallWrapper:
55
+ """
56
+ When using cpp wrapper, GPU kernel load and launch needs to wait for Triton kernels
57
+ to be tuned and stored as cubin files, so use a deferred generating the final wrapper around
58
+ the triton kernel until right before the prefix is written.
59
+ """
60
+
61
+ wrapper_name: str
62
+ kernel_name: str
63
+ kernel_name_to_body: dict[str, str]
64
+ arg_types: list[Any]
65
+
66
+ def generate(self, wrapper: CppWrapperGpu):
67
+ """
68
+ Generate the GPU kernel definition, as well as load and launch code.
69
+ """
70
+ prefix = wrapper.prefix
71
+ if self.kernel_name.startswith("multi_kernel_"):
72
+ # MultiKernel will select one kernel after running the autotune block
73
+ self.kernel_name = MultiKernelCall.lookup_choice(self.kernel_name)
74
+ params = CudaKernelParamCache.get(self.kernel_name)
75
+ assert params, f"CudaKernelParamCache not populated for {self.kernel_name}"
76
+ def_args = params["def_args"]
77
+ arg_types = self.arg_types
78
+ inductor_meta = params["inductor_meta"]
79
+
80
+ if "extra_launcher_args" in inductor_meta and len(def_args) > len(arg_types):
81
+ # extra_launcher_args should already be in def_args
82
+ assert len(def_args) == len(arg_types) - len(
83
+ inductor_meta["extra_launcher_args"]
84
+ )
85
+ arg_types = arg_types + [SymbolicCallArg] * len(
86
+ inductor_meta["extra_launcher_args"]
87
+ )
88
+
89
+ if not V.graph.aot_mode:
90
+ prefix.writeline(
91
+ maybe_hipify_code_wrapper(
92
+ f"static {wrapper.device_codegen.cpp_kernel_type()} {self.kernel_name} = nullptr;"
93
+ )
94
+ )
95
+ kernel_var_name = self.kernel_name
96
+ else:
97
+ kernel_var_name = f"kernels_.{self.kernel_name}"
98
+
99
+ # tensors can be RAIIAtenTensorHandle or ConstantHandle, so make them template types
100
+ template_types = [
101
+ f"typename {name}_type_"
102
+ for name, arg_type in zip(def_args, arg_types)
103
+ if isinstance(arg_type, (torch_dtype, UnwrapUnspecArg))
104
+ ]
105
+ if V.graph.aot_mode:
106
+ template_types.append("typename kernels_type_")
107
+ if template_types:
108
+ prefix.writeline(f"template <{', '.join(template_types)}>")
109
+ prefix.writeline(f"static inline void {self.wrapper_name}(")
110
+ with prefix.indent():
111
+ assert len(def_args) == len(arg_types), (def_args, arg_types)
112
+ for name, arg_type in zip(def_args, arg_types):
113
+ if isinstance(arg_type, (torch_dtype, UnwrapUnspecArg)):
114
+ prefix.writeline(f"const {name}_type_& {name},")
115
+ elif issubclass(arg_type, (SymbolicCallArg, sympy.Expr, int)):
116
+ prefix.writeline(f"int64_t {name},")
117
+ elif arg_type is float:
118
+ prefix.writeline(f"float {name},")
119
+ elif arg_type is bool:
120
+ prefix.writeline(f"bool {name},")
121
+ else:
122
+ raise ValueError(f"Unexpected arg type {arg_type}")
123
+ prefix.writeline("int32_t device_idx_,")
124
+ prefix.writeline(
125
+ maybe_hipify_code_wrapper(
126
+ f"{wrapper.device_codegen.cpp_stream_type()} stream_,"
127
+ )
128
+ )
129
+ if V.graph.aot_mode:
130
+ prefix.writeline("kernels_type_& kernels_,")
131
+ prefix.writeline(
132
+ "const std::optional<std::string>& cubin_dir_ = std::nullopt"
133
+ )
134
+ prefix.writeline("){")
135
+ with prefix.indent():
136
+ if V.graph.aot_mode:
137
+ # Emit the original Triton kernel for debugging purposes
138
+ prefix.writeline("/*")
139
+ prefix.splice(self.kernel_name_to_body[self.kernel_name])
140
+ prefix.writeline("*/")
141
+ self.generate_grid(prefix, inductor_meta, params)
142
+ self.generate_load_kernel(prefix, kernel_var_name, params)
143
+ self.generate_launch_kernel(prefix, wrapper, kernel_var_name, params)
144
+ prefix.writeline("}")
145
+
146
+ if not config.aot_inductor.embed_kernel_binary:
147
+ # Ensure the cubin file is included in the package
148
+ V.graph.wrapper_code.additional_files.append(
149
+ params[get_cpp_wrapper_cubin_path_name()]
150
+ )
151
+
152
+ def generate_grid(
153
+ self,
154
+ prefix: IndentedBuffer,
155
+ inductor_meta: dict[str, Any],
156
+ params: dict[str, Any],
157
+ ):
158
+ from ..runtime.triton_heuristics import GridExpr
159
+
160
+ grid = GridExpr.from_meta(inductor_meta, params["config"], mode="cpp")
161
+ for line in grid.prefix:
162
+ prefix.writeline(line)
163
+ prefix.splice(
164
+ f"""\
165
+ uint32_t grid_0 = {grid.x_grid};
166
+ uint32_t grid_1 = {grid.y_grid};
167
+ uint32_t grid_2 = {grid.z_grid};
168
+ """
169
+ )
170
+ prefix.writeline("if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;")
171
+
172
+ def generate_load_kernel(self, prefix, kernel_var_name, params):
173
+ prefix.writeline(f"if ({kernel_var_name} == nullptr) {{")
174
+ with prefix.indent():
175
+ embed_kernel_args = [f"__{params['inductor_meta']['kernel_name']}_start"]
176
+ if torch.xpu.is_available():
177
+ # XPU needs the end address of the kernel to calculate the size of the kernel binary.
178
+ embed_kernel_args.append(
179
+ f"__{params['inductor_meta']['kernel_name']}_end"
180
+ )
181
+
182
+ load_kernel_args = (
183
+ [
184
+ *embed_kernel_args,
185
+ cpp_string_literal(params["mangled_name"]),
186
+ str(params["shared_mem"]),
187
+ ]
188
+ if V.graph.aot_mode and config.aot_inductor.embed_kernel_binary
189
+ else [
190
+ cpp_string_literal(params[get_cpp_wrapper_cubin_path_name()]),
191
+ cpp_string_literal(params["mangled_name"]),
192
+ str(params["shared_mem"]),
193
+ "cubin_dir_",
194
+ ]
195
+ )
196
+ prefix.writeline(
197
+ f"{kernel_var_name} = loadKernel({', '.join(load_kernel_args)}); "
198
+ )
199
+ prefix.writeline("}")
200
+
201
+ def generate_launch_kernel(self, prefix, wrapper, kernel_var_name, params):
202
+ triton_meta = params["triton_meta"]
203
+ assert len(self.arg_types) == len(params["def_args"]), (
204
+ self.arg_types,
205
+ params["def_args"],
206
+ )
207
+ arg_type_loookup = dict(zip(params["def_args"], self.arg_types))
208
+ # difference between Python and C++ wrapper: C++ wrapper strips out equal_to_1 constants
209
+ call_args = [
210
+ name for name in params["call_args"] if name not in triton_meta["constants"]
211
+ ]
212
+ arg_types = [arg_type_loookup[name] for name in call_args]
213
+ arg_signatures = [triton_meta["signature"][name] for name in call_args]
214
+ call_args_str = wrapper.generate_args_decl(
215
+ prefix,
216
+ call_args,
217
+ arg_types,
218
+ arg_signatures,
219
+ workspace_size=params.get("global_scratch") or 0,
220
+ )
221
+ prefix.writeline(f"void* kernel_args_[] = {{{call_args_str}}};")
222
+ launch_kernel_args = [
223
+ kernel_var_name,
224
+ "grid_0",
225
+ "grid_1",
226
+ "grid_2",
227
+ str(params["num_warps"]),
228
+ str(params["shared_mem"]),
229
+ "kernel_args_",
230
+ "stream_",
231
+ ]
232
+ prefix.writeline(f"launchKernel({', '.join(launch_kernel_args)});")
233
+
234
+
235
+ class CppWrapperGpu(CppWrapperCpu):
236
+ """
237
+ Generates cpp wrapper for running on GPU and calls CUDA kernels
238
+ """
239
+
240
+ def __init__(self) -> None:
241
+ self.device = get_gpu_type()
242
+ self.device_codegen = get_device_op_overrides(self.device)
243
+ super().__init__()
244
+ self.grid_id = count()
245
+ self._kernel_name_to_body: dict[str, str] = {}
246
+ self._triton_call_wrappers: dict[str, DeferredTritonCallWrapper] = {}
247
+ self.autotune_input_prefix = "_REAL_AUTOTUNE_INPUT"
248
+
249
+ @staticmethod
250
+ def create(
251
+ is_subgraph: bool,
252
+ subgraph_name: Optional[str],
253
+ parent_wrapper: Optional[PythonWrapperCodegen],
254
+ partition_signatures: Optional[GraphPartitionSignature] = None,
255
+ ):
256
+ # TODO - support subgraph codegen by lifting functions. Check the
257
+ # comment at CppWrapperCpu `codegen_subgraph` function.
258
+ return CppWrapperGpu()
259
+
260
+ def write_header(self):
261
+ if V.graph.is_const_graph:
262
+ # We do not write header for constant graph, it will be written by main module.
263
+ return
264
+
265
+ super().write_header()
266
+ self.header.splice(
267
+ maybe_hipify_code_wrapper(self.device_codegen.kernel_driver())
268
+ )
269
+
270
+ @cache_on_self
271
+ def write_tma_descriptor_helpers_once(self):
272
+ self.header.splice(self.device_codegen.tma_descriptor_helpers())
273
+
274
+ def write_get_raw_stream(self, device_idx: int, graph_name: str) -> str:
275
+ name = f"stream{device_idx}"
276
+ self.writeline(
277
+ maybe_hipify_code_wrapper(
278
+ f"{self.device_codegen.cpp_stream_type()} {name};"
279
+ )
280
+ )
281
+ self.writeline(
282
+ f"AOTI_TORCH_ERROR_CODE_CHECK({self.device_codegen.aoti_get_stream()}({device_idx}, (void**)&{name}));"
283
+ )
284
+ return name
285
+
286
+ def get_autotuning_input_name(self, idx):
287
+ return f"{self.autotune_input_prefix}_{idx}"
288
+
289
+ def codegen_inputs(self):
290
+ # See Note: [Input Alignment handling in Inductor]
291
+ #
292
+ # JIT Inductor does not guard on input alignment. It relies on copy_misaligned_inputs to
293
+ # copy misaligned inputs to aligned buffers. For AOTInductor, we need to do the same in cpp.
294
+
295
+ if config.is_fbcode():
296
+ # TODO: This is added because FC. Remove this once the newly added shim symbols,
297
+ # e.g. aoti_torch_clone_preserve_strides, have landed
298
+ return super().codegen_inputs()
299
+
300
+ if V.graph.aot_mode and V.graph.inputs_to_check:
301
+ for idx in V.graph.inputs_to_check:
302
+ input_name = V.graph.graph_input_names[idx]
303
+ assert input_name in V.graph.graph_inputs, (
304
+ f"{input_name} not found in graph inputs"
305
+ )
306
+ value = V.graph.graph_inputs[input_name]
307
+ assert isinstance(value, TensorBox), (
308
+ f"{input_name} is expected to be tensor but found as {type(value)}"
309
+ )
310
+ warn_msg = (
311
+ f"Input {idx} was compiled as {GPU_ALIGN_BYTES}-bytes aligned, "
312
+ "but it is not aligned at run time. Copying to an aligned tensor "
313
+ "to guarantee correctness, but expect a performance hit."
314
+ )
315
+ self.prefix.splice(
316
+ f"""
317
+ if ((long({input_name}.data_ptr()) & ({GPU_ALIGN_BYTES} -1)) != 0) {{
318
+ AOTI_TORCH_WARN("{warn_msg}");
319
+ AtenTensorHandle {input_name}_aligned;
320
+ aoti_torch_clone_preserve_strides({input_name}, &{input_name}_aligned);
321
+ {input_name} = std::move(RAIIAtenTensorHandle({input_name}_aligned));
322
+ }}
323
+ """
324
+ )
325
+
326
+ super().codegen_inputs()
327
+
328
+ def _define_kernel_helper(
329
+ self,
330
+ kernel_name: str,
331
+ kernel_body: str,
332
+ metadata: Optional[str] = None,
333
+ gpu: bool = True,
334
+ cpp_definition: Optional[str] = None,
335
+ ):
336
+ if gpu:
337
+ self._kernel_name_to_body[kernel_name] = kernel_body
338
+ if config.triton.autotune_at_compile_time:
339
+ # Call PythonWrapperCodegen to create the autotune code block
340
+ PythonWrapperCodegen._define_kernel_helper(
341
+ self, kernel_name, kernel_body, metadata, gpu, cpp_definition
342
+ )
343
+ else:
344
+ return CppWrapperCpu._define_kernel_helper(
345
+ self, kernel_name, kernel_body, metadata, gpu, cpp_definition
346
+ )
347
+
348
+ def generate(self, is_inference):
349
+ with dynamo_timed("CppWrapperGpu.generate", log_pt2_compile_event=True):
350
+ return super().generate(is_inference)
351
+
352
def finalize_prefix(self):
    """Define the triton kernel callers now that autotuning is finished.

    Final prefix layout: parent-class prefix content first, then the
    generated triton caller definitions, then whatever was already in the
    prefix before this call.
    """
    original_prefix = self.prefix  # new content must precede this

    # Generating triton kernel callers can mutate the prefix (cached dtype
    # declarations), so render them into a scratch buffer before running
    # super().finalize_prefix(), then splice them after its output.
    self.prefix = IndentedBuffer()
    for wrapper in self._triton_call_wrappers.values():
        self.prefix.writeline("\n")
        wrapper.generate(self)
    caller_definitions = self.prefix

    self.prefix = IndentedBuffer()
    super().finalize_prefix()

    self.prefix.splice(caller_definitions)

    self.prefix.writeline("\n")
    self.prefix.splice(original_prefix)
372
+
373
def generate_tma_descriptor(self, desc):
    """Emit a host-side TMA descriptor for `desc`.

    Dispatches between the experimental and the stable triton TMA APIs.
    """
    self.write_tma_descriptor_helpers_once()

    if isinstance(desc, TMADescriptorExperimental):
        self._generate_experimental_tma_descriptor(desc)
    else:
        assert isinstance(desc, TMADescriptorStable)
        self._generate_stable_tma_descriptor(desc)
381
+
382
def _generate_experimental_tma_descriptor(self, desc):
    """Emit an experimental-API CUtensorMap and initialize it via the
    init{1,2}DTMADescriptor helper."""
    # Generate the data pointer for the source tensor. These args are
    # passed to initNDTMADescriptor, which is NOT a triton kernel.
    source = self.generate_args_decl(
        code=self,
        call_args=[self.val_to_arg_str(desc.tensor)],
        arg_types=[desc.tensor.get_dtype()],
        arg_signatures=[None],
        is_triton_kernel=False,
    )

    desc_name = desc.name
    self.writeline(f"alignas(64) CUtensorMap {desc_name};")

    # `source` has the form `&var_x`, where `var_x` is the data pointer
    # (CUdeviceptr); dereference it and cast to `void*` so it can be
    # handed to `init{1,2}DTMADescriptor`.
    data_ptr = f"reinterpret_cast<void*>(*({source}))"
    dims = ", ".join(self.val_to_arg_str(d) for d in desc.dims)
    block_dims = ", ".join(self.val_to_arg_str(d) for d in desc.block_dims)
    element_size = self.val_to_arg_str(desc.element_size)
    helper = f"init{desc.rank}DTMADescriptor"
    helper_args = f"&{desc_name}, {data_ptr}, {dims}, {block_dims}, {element_size}"
    self.writeline(f"{helper}({helper_args});")
407
+
408
def _generate_stable_tma_descriptor(self, desc):
    """Emit a StableTMADescriptor struct for `desc` and initialize it.

    See [Note: AOTI TMA Stable handling] for why a struct (rather than a
    bare CUtensorMap) is generated.
    """
    # Data pointer for the source tensor; initTMADescriptor is NOT a
    # triton kernel.
    source = self.generate_args_decl(
        code=self,
        call_args=[self.val_to_arg_str(desc.tensor)],
        arg_types=[desc.tensor.get_dtype()],
        arg_signatures=[None],
        is_triton_kernel=False,
    )

    desc_name = desc.name
    # Pack the relevant information into a StableTMADescriptor struct.
    self.writeline(f"alignas(64) StableTMADescriptor {desc_name};")

    def fill_array(name, values):
        # One assignment per element; `values` may hold sympy expressions.
        for i, val in enumerate(values):
            self.writeline(f"{name}[{i}] = {val};")

    ptr = f"reinterpret_cast<void*>(*({source}))"
    rank = len(desc.tensor.get_size())

    fill_array(f"{desc_name}.block_shape", desc.block_shape)
    fill_array(f"{desc_name}.global_shape", desc.tensor.get_size())
    fill_array(f"{desc_name}.strides", desc.tensor.get_stride())

    element_size = self.val_to_arg_str(desc.tensor.get_dtype().itemsize)
    helper_args = ", ".join(
        str(x)
        for x in (
            f"&{desc_name}.m",
            ptr,
            element_size,
            rank,
            f"{desc_name}.block_shape",
            f"{desc_name}.global_shape",
            f"{desc_name}.strides",
        )
    )
    self.writeline(f"initTMADescriptor({helper_args});")
449
+
450
def generate_args_decl(
    self,
    code: Union[IndentedBuffer, Self],
    call_args,
    arg_types,
    arg_signatures,
    is_triton_kernel=True,
    workspace_size=0,
):
    """
    Generates any declarations of args to pass into a kernel call, and then
    returns the arg names.

    In more detail:
    * declarations: this function has a side effect of writing lines like
      `auto var_0 = ...;` into `code`
    * returns: a string with the list of args, e.g. "var_0, var_1"

    call_args: list of call arguments
    arg_types: list of argument types
    arg_signatures: list with signatures of all the args
    is_triton_kernel: whether these are passed into a triton kernel or not.
        In particular, calls to triton kernels will have an additional
        global scratch space arg injected at the front of the arg list.
    """
    new_args: list[str] = []

    # Triton signature string -> C++ type. Add more cases as needed.
    signature2dtype = {
        "i32": "int32_t",
        "i64": "int64_t",
        "fp32": "float",
    }

    def signature_is_tma_desc(sig):
        # Both the experimental ("nvTmaDesc") and stable ("tensordesc<...>")
        # host-side TMA descriptor signatures count.
        if not sig:
            return False
        return sig == "nvTmaDesc" or sig.startswith("tensordesc<")

    def process_tma_stable_arg(arg, arg_type, arg_signature, var_name):
        # [Note: AOTI TMA Stable handling]
        # For most args, a single arg passed to the python triton interface
        # maps to a single arg in the cubin interface. However, for
        # host-side TMA descriptors, a single python arg turns into
        # 1 + 2 * N args in the cubin interface (where N is the rank).
        #
        # At TMA codegen time (for aoti) we generate a struct
        # (StableTMADescriptor) with the necessary information; here, at
        # call time, we unpack the struct members.
        code.writeline(f"auto {var_name} = {cexpr(arg)};")

        # Recover the rank from the signature string; format documented at
        # https://github.com/triton-lang/triton/blob/16961b79bdac1b774b42d44e52fd55a266ec2866/third_party/nvidia/backend/driver.py#L111  # noqa: B950
        match = re.match("tensordesc<([^[>]*)\\[([^]]*)\\]", arg_signature)
        assert match is not None
        ndim = match.group(2).count(",") + 1

        unpacked = [f"&{var_name}.m"]
        unpacked.extend(f"&{var_name}.block_shape[{i}]" for i in range(ndim))
        unpacked.extend(f"&{var_name}.strides[{i}]" for i in range(ndim))
        return unpacked

    def process_args(arg, arg_type, arg_signature=None):
        var_name = f"var_{next(self.arg_var_id)}"
        # The first two branches skip TMA descriptors, as host-side TMA
        # descriptors need to be passed to the compiled kernel by value.
        if isinstance(arg_type, UnwrapUnspecArg) and not signature_is_tma_desc(
            arg_signature
        ):
            # 0d CPU tensor wrapping an unspec scalar: extract via .item().
            self.codegen_tensor_item(
                arg_type.dtype,
                arg,
                var_name,
                indented_buffer=code,
            )
            new_args.append(f"&{var_name}")
        elif isinstance(arg_type, torch_dtype) and not signature_is_tma_desc(
            arg_signature
        ):
            device_ptr_type = self.device_codegen.cpp_device_ptr()
            code.writeline(
                maybe_hipify_code_wrapper(
                    f"{device_ptr_type} {var_name} = reinterpret_cast<{device_ptr_type}>({arg}.data_ptr());"
                )
            )
            new_args.append(f"&{var_name}")
        elif arg_type in (sympy.Integer, int):
            code.writeline(f"int {var_name} = {cexpr(arg)};")
            new_args.append(f"&{var_name}")
        elif arg_type in (sympy.Float, float):
            code.writeline(f"float {var_name} = {cexpr(arg)};")
            new_args.append(f"&{var_name}")
        elif (
            isinstance(arg_type, type(SymbolicCallArg))
            and arg_signature is not None
            and arg_signature in signature2dtype.keys()
        ):
            # For symbolic call arguments, use the arg signatures from
            # triton meta to cast explicitly: `auto` can infer an
            # unexpected type against the kernel input signature.
            code.writeline(
                f"{signature2dtype[arg_signature]} {var_name} = {cexpr(arg)};"
            )
            new_args.append(f"&{var_name}")
        elif arg_signature and arg_signature.startswith("tensordesc<"):
            new_args.extend(
                process_tma_stable_arg(arg, arg_type, arg_signature, var_name)
            )
        else:
            code.writeline(f"auto {var_name} = {cexpr(arg)};")
            new_args.append(f"&{var_name}")

    for arg, arg_type, arg_signature in zip_longest(
        call_args, arg_types, arg_signatures
    ):
        process_args(arg, arg_type, arg_signature)

    if (
        is_triton_kernel
        and (
            global_scratch := self.device_codegen.cpp_global_scratch(
                next(self.arg_var_id),
                workspace=TritonScratchWorkspace(
                    size=workspace_size,
                    generate_dtype_str=(lambda: self.codegen_dtype(torch.uint8)),
                ),
            )
        )
        is not None
    ):
        global_scratch_def, global_scratch_var = global_scratch
        code.writelines([maybe_hipify_code_wrapper(x) for x in global_scratch_def])
        new_args.append(f"&{global_scratch_var}")

    return ", ".join(new_args)
593
+
594
def _generate_kernel_call_helper(
    self,
    kernel_name: str,
    call_args,
    *,
    device=None,
    triton=True,
    arg_types=None,
    raw_keys=None,
    raw_args=None,
    triton_meta=None,
    graph_name="",
    original_fxnode_name=None,
):
    """
    Override the default value of argument 'gpu' to True here.
    generate_kernel_call can still be called with gpu=False because of
    a mix of cpu kernels and gpu kernels.
    """
    device = device or V.graph.get_current_device_or_throw()
    if device.type == "cpu":
        # Even in CppWrapperGpu, we may see cpp kernels.
        return CppWrapperCpu._generate_kernel_call_helper(
            self,
            kernel_name,
            call_args,
            device=device,
            triton=triton,
            arg_types=arg_types,
            raw_keys=raw_keys,
            raw_args=raw_args,
            triton_meta=triton_meta,
        )

    if (
        triton
        and config.triton.autotune_at_compile_time
        and kernel_name not in self.kernel_autotune_names
    ):
        # Call PythonWrapperCodegen to create the autotune code block.
        PythonWrapperCodegen._generate_kernel_call_helper(
            self,
            kernel_name,
            call_args,
            device=device,
            triton=triton,
            arg_types=arg_types,
            raw_keys=raw_keys,
            raw_args=raw_args,
            triton_meta=triton_meta,
            original_fxnode_name=original_fxnode_name,
        )

    stream = (
        "stream"
        if V.graph.aot_mode
        else self.write_get_raw_stream(device.index, graph_name)
    )

    if not triton:
        # Non-triton GPU kernel: invoke directly, casting pointer args to
        # the declared C++ type via .data_ptr().
        casted_args = []
        for arg_type, arg in zip(arg_types, call_args):
            value = arg
            if arg_type.endswith("*") and arg != "nullptr":
                value = f"{arg}.data_ptr()"
            casted_args.append(f"({arg_type}){cexpr(value)}")
        self.writeline(f"kernels.{kernel_name}({', '.join(casted_args)}, {stream});")
        return

    call_args, arg_types = self.prepare_triton_wrapper_args(call_args, arg_types)
    wrapper_name = f"call_{kernel_name}"
    if wrapper_name not in self._triton_call_wrappers:
        # Defer generating the caller until autotuning has finished.
        self._triton_call_wrappers[wrapper_name] = DeferredTritonCallWrapper(
            wrapper_name,
            kernel_name,
            self._kernel_name_to_body,
            arg_types,
        )
    device_idx = "this->device_idx_" if V.graph.aot_mode else str(device.index)
    call_args.append(device_idx)
    call_args.append(stream)
    if V.graph.aot_mode:
        call_args.append("kernels")
        call_args.append("this->cubin_dir_")
    debug_printer_manager = V.graph.wrapper_code.debug_printer
    debug_printer_manager.set_printer_args(
        call_args[: len(arg_types)], kernel_name, arg_types, None
    )
    with debug_printer_manager:
        self.writeline(f"{wrapper_name}({', '.join(call_args)});")
686
+
687
+ @staticmethod
688
+ def prepare_triton_wrapper_args(
689
+ call_args: list[Any], arg_types: list[Any]
690
+ ) -> tuple[list[Any], list[Any]]:
691
+ assert len(call_args) == len(arg_types), (call_args, arg_types)
692
+ new_args = []
693
+ new_args_types = []
694
+ for arg, arg_type in zip(call_args, arg_types):
695
+ if isinstance(arg, str):
696
+ if isinstance(arg_type, torch_dtype) and should_unwrap_unspec_arg(arg):
697
+ # dynamo wraps unspec variable as 0d CPU tensor, need convert to scalar
698
+ arg_type = UnwrapUnspecArg(dtype=arg_type)
699
+ new_args.append(arg)
700
+ elif isinstance(arg, bool):
701
+ new_args.append(str(arg).lower())
702
+ elif isinstance(arg, (int, float, SymbolicCallArg)):
703
+ new_args.append(str(arg))
704
+ else:
705
+ new_args.append(cexpr(V.graph.sizevars.simplify(arg)))
706
+ new_args_types.append(arg_type)
707
+ return new_args, new_args_types
708
+
709
def make_zero_buffer(self, name):
    """Return the C++ statement that zero-fills buffer `name` via the AOTI shim."""
    zero_call = f"aoti_torch_zero_({name}.get())"
    return f"AOTI_TORCH_ERROR_CODE_CHECK({zero_call});"
711
+
712
+
713
@dataclasses.dataclass
class UnwrapUnspecArg:
    """Marker that we need to call .item() on the tensor"""

    # Scalar dtype produced by the .item() call.
    dtype: torch_dtype
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp_wrapper_mps.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Optional
2
+
3
+ import sympy
4
+
5
+ import torch
6
+
7
+ from ..ir import GraphPartitionSignature
8
+ from ..virtualized import V
9
+ from .cpp_wrapper_gpu import CppWrapperGpu
10
+ from .wrapper import PythonWrapperCodegen
11
+
12
+
13
class CppWrapperMps(CppWrapperGpu):
    """C++ wrapper codegen for Metal (MPS) kernels."""

    @staticmethod
    def create(
        is_subgraph: bool,
        subgraph_name: Optional[str],
        parent_wrapper: Optional[PythonWrapperCodegen],
        partition_signatures: Optional[GraphPartitionSignature] = None,
    ) -> "CppWrapperMps":
        return CppWrapperMps()

    def _generate_kernel_call_helper(
        self,
        kernel_name: str,
        call_args: list[str],
        arg_types: Optional[list[type]] = None,
        **kwargs: dict[str, Any],
    ) -> None:
        """
        Generates MPS kernel call code. It should look something like:
        ```
        auto mps_lib_0_func = mps_lib_0.getKernelFunction("generated_kernel");
        auto mps_lib_0_func_handle = AOTIMetalKernelFunctionHandle(mps_lib_0_func.get());
        mps_lib_0_func->runCommandBlock([&] {
            mps_lib_0_func->startEncoding();
            aoti_torch_mps_set_arg(mps_lib_0_func_handle, 0, buf0);
            aoti_torch_mps_set_arg(mps_lib_0_func_handle, 1, arg0_1);
            ...
            mps_lib_0_func->dispatch(9);
        });
        ```
        """
        assert arg_types is not None

        # The last two call args are the dispatch sizes; everything before
        # them is a kernel argument bound by index.
        kernel_body = []
        for idx, (arg, arg_type) in enumerate(zip(call_args[:-2], arg_types[:-2])):
            if isinstance(arg_type, torch.dtype):
                kernel_body.append(
                    f"aoti_torch_mps_set_arg_tensor({kernel_name}_handle, {idx}, {arg});\n"
                )
            elif arg_type in (int, sympy.core.symbol.Symbol):
                kernel_body.append(
                    f"aoti_torch_mps_set_arg_int({kernel_name}_handle, {idx}, {arg});\n"
                )
            else:
                raise NotImplementedError(
                    f"Unsupported arg type {arg_type} for arg {arg} for kernel {kernel_name}"
                )

        threads, group_size = call_args[-2], call_args[-1]
        if threads is None:
            raise NotImplementedError("No threads or group_size provided")
        elif group_size is None:
            kernel_body.append(f"{kernel_name}->dispatch({threads});\n")
        else:
            kernel_body.append(f"{kernel_name}->dispatch({threads}, {group_size});\n")

        # debug printer related logic for cpp kernel type.
        debug_printer_manager = V.graph.wrapper_code.debug_printer
        debug_printer_manager.set_printer_args(
            call_args[:-2],
            kernel_name,
            None,
            None,
            "cpp",
        )
        with debug_printer_manager:
            self.writeline(self.wrap_kernel_call(kernel_name, kernel_body))

    def wrap_kernel_call(self, name: str, call_args: list[str]) -> str:
        # `name` has the form "<lib>_func"; recover the library variable name.
        lib_name = name[: -len("_func")]
        calling_args = " ".join(call_args)
        return f"""
    auto {name} = {lib_name}.getKernelFunction("generated_kernel");
    auto {name}_handle = AOTIMetalKernelFunctionHandle({name}.get());
    {name}->runCommandBlock([&] {{
        {name}->startEncoding();
        {calling_args}
    }});
    """

    @staticmethod
    def get_device_include_path(device: str) -> str:
        assert V.graph.aot_mode
        return (
            "#include <torch/csrc/inductor/aoti_include/mps.h>\n"
            "#include <torch/csrc/inductor/aoti_torch/c/shim_mps.h>"
        )
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpu_device_op_overrides.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from textwrap import dedent
4
+
5
+ from .common import DeviceOpOverrides, register_device_op_overrides
6
+
7
+
8
class CpuDeviceOpOverrides(DeviceOpOverrides):
    """Device-op code snippets for the CPU backend.

    CPU needs no real stream or device management, so most hooks emit
    no-op code.
    """

    def import_get_raw_stream_as(self, name: str) -> str:
        # The "raw stream" on CPU is a dummy constant 0.
        return dedent(
            """
            def get_raw_stream(_):
                return 0
            """
        )

    def set_device(self, device_idx: int) -> str:
        return "pass"

    def synchronize(self) -> str:
        return "pass"

    def device_guard(self, device_idx: int) -> str:
        return "pass"


register_device_op_overrides("cpu", CpuDeviceOpOverrides())
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/cuda/__init__.py ADDED
File without changes
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/cuda/cuda_cpp_scheduling.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import hashlib
3
+ import logging
4
+ from collections.abc import Sequence
5
+ from typing import cast
6
+
7
+ from torch._inductor.codegen.cuda.cutlass_python_evt import (
8
+ CutlassEVTCodegen,
9
+ MockCutlassHandler,
10
+ )
11
+ from torch._inductor.utils import Placeholder
12
+ from torch.utils._ordered_set import OrderedSet
13
+
14
+ from ...._dynamo.utils import counters
15
+ from ... import config
16
+ from ...codecache import code_hash, get_path
17
+ from ...ir import Buffer, ComputedBuffer, CUDATemplateBuffer, Pointwise
18
+ from ...scheduler import (
19
+ BaseSchedulerNode,
20
+ BaseScheduling,
21
+ FusedSchedulerNode,
22
+ SchedulerNode,
23
+ WhyNoFuse,
24
+ )
25
+ from ...utils import get_fused_kernel_name, get_kernel_metadata, sympy_product
26
+ from ...virtualized import V
27
+ from ..common import BackendFeature, IndentedBuffer
28
+
29
+
30
+ log = logging.getLogger(__name__)
31
+
32
+
33
class WhyNoFuseNames(WhyNoFuse):
    """WhyNoFuse variant constructed directly from node names (no nodes)."""

    def __init__(self, name1: str, name2: str) -> None:
        self.name1 = name1
        self.name2 = name2
37
+
38
+
39
class CUDACPPScheduling(BaseScheduling):
    """
    Partial Scheduling implementation for CUDA C++ Kernels.
    This class is intended to be used in combination with TritonScheduling,
    and delegated to by CUDACombinedScheduling.

    It handles fusion decisions and CUDA C++ specific template code generation.
    """

    @classmethod
    def get_backend_features(cls, device) -> OrderedSet[BackendFeature]:
        # No optional backend features are advertised for CUDA C++ templates.
        return OrderedSet()

    def group_fn(self, sizes):
        return tuple(V.graph.sizevars.simplify(sympy_product(s)) for s in sizes)

    @staticmethod
    def is_cuda_cpp_template(node: BaseSchedulerNode) -> bool:
        # A template node is a SchedulerNode wrapping a CUDATemplateBuffer.
        return isinstance(node, SchedulerNode) and isinstance(
            node.node, CUDATemplateBuffer
        )

    def is_cuda_cpp_fused_template(self, node: BaseSchedulerNode) -> bool:
        return isinstance(node, FusedSchedulerNode) and self.is_cuda_cpp_template(node)

    def can_fuse_vertical(
        self, node1: BaseSchedulerNode, node2: BaseSchedulerNode
    ) -> bool:
        # Case 1: a bare template node gaining its first epilogue.
        if self.is_cuda_cpp_template(node1) and isinstance(node2, BaseSchedulerNode):
            assert node1.node, "node1.node should not be None"
            return self._can_fuse_epilogue_impl(
                cast(CUDATemplateBuffer, node1.node),
                [],
                node2,  # type: ignore[arg-type]
            )
        # Case 2: an already-fused template gaining another epilogue.
        elif self.is_cuda_cpp_fused_template(node1) and isinstance(
            node2, BaseSchedulerNode
        ):
            assert node1.node, "node1.node should not be None"
            assert node2.node, "node2.node should not be None"
            fnode1 = cast(FusedSchedulerNode, node1)
            return self._can_fuse_epilogue_impl(
                fnode1.get_template_node(),  # type: ignore[arg-type]
                self._unwrap_epilogue_nodes(fnode1),
                node2,  # type: ignore[arg-type]
            )

        return False

    def define_kernel(self, src_code: str, node_schedule) -> str:
        wrapper = V.graph.wrapper_code
        if src_code in wrapper.src_to_kernel:
            return wrapper.src_to_kernel[src_code]

        fused_name = (
            get_fused_kernel_name(node_schedule, config.triton.descriptive_names)
            if config.triton.descriptive_names
            else ""
        )

        # Hash the original src_code (before the name placeholder is filled).
        kernel_hash = hashlib.sha256(src_code.encode("utf-8")).hexdigest()[:8]
        if fused_name == "fused":
            # no EVT kernel, use the original kernel name
            kernel_name = f"cutlass_{kernel_hash}"
        else:
            kernel_name = f"cutlass_{fused_name}_{kernel_hash}"
        wrapper.src_to_kernel[src_code] = kernel_name
        src_code = src_code.replace(str(Placeholder.KERNEL_NAME), kernel_name)

        _, _, kernel_path = get_path(code_hash(src_code), "py")

        compile_wrapper = IndentedBuffer()
        compile_wrapper.writeline("async_compile.cuda(r'''")
        compile_wrapper.splice(src_code, strip=True)
        compile_wrapper.writeline(f"''', 'so', aot_compile={str(V.graph.aot_mode)})")

        metadata_comment = f"# kernel path: {kernel_path}"
        origins, detailed_origins = get_kernel_metadata(node_schedule, wrapper)
        metadata_comment += "\n" + origins + "\n" + detailed_origins
        wrapper.define_kernel(kernel_name, compile_wrapper.getvalue(), metadata_comment)
        return kernel_name

    def codegen_template(
        self,
        template_node: BaseSchedulerNode,
        epilogue_nodes: Sequence[BaseSchedulerNode],
        prologue_nodes: Sequence[BaseSchedulerNode],
    ):
        """
        Codegen a CUDA template, possibly with fused epilogues
        """
        counters["inductor"]["cuda_epilogue_fusion_counter"] += len(epilogue_nodes)
        assert self.is_cuda_cpp_template(template_node), (
            "Template node passed to CUDAScheduler.codegen_template must be a SchedulerNode that wraps a CUDATemplateBuffer"
        )
        template_node = cast(SchedulerNode, template_node)
        _, (_numel, rnumel) = template_node.group
        assert rnumel == 1
        ctb: CUDATemplateBuffer = cast(CUDATemplateBuffer, template_node.node)
        epilogue_ir_nodes: list[Buffer] = [n.node for n in epilogue_nodes]  # type: ignore[misc]
        assert all(isinstance(n, ComputedBuffer) for n in epilogue_ir_nodes), (
            "Epilogue nodes must all be instances of ir.ComputedBuffer"
        )
        kernel, render = ctb.make_kernel_render(ctb, epilogue_nodes=epilogue_nodes)

        with kernel:
            for node in [template_node, *epilogue_nodes]:
                node.mark_run()

            # Typically a codegen pass runs after mark_run. Here the C++ is
            # already generated, but the kernel must still see the epilogue
            # loads/stores so memory planning can optimize allocations.
            ctb.emulate_store_fn()
            for node in epilogue_ir_nodes:
                with V.set_ops_handler(MockCutlassHandler(V.get_ops_handler())):
                    assert isinstance(
                        node, ComputedBuffer
                    )  # Not sure why we need to do this again
                    node.get_store_function()(CutlassEVTCodegen.get_index_vars(node))

            with V.set_kernel_handler(kernel):
                src_code = render()
                node_schedule = [template_node, *epilogue_nodes]
                kernel_name = self.define_kernel(src_code, node_schedule)

        # debug printing values of intermediate tensors
        _, call_args, arg_signatures, _ = kernel.args.python_argdefs()
        debug_printer_manager = V.graph.wrapper_code.debug_printer
        debug_printer_manager.set_printer_args(
            call_args, kernel_name, arg_signatures, kernel
        )
        with debug_printer_manager:
            kernel.call_kernel(kernel_name, ctb)

        V.graph.removed_buffers |= kernel.removed_buffers
        self.free_buffers_in_scheduler()

    @staticmethod
    def _unwrap_epilogue_nodes(
        fused_node: FusedSchedulerNode,
    ) -> list[BaseSchedulerNode]:
        # Everything in the fused node except the template itself.
        nodes = fused_node.get_nodes()
        template_node = fused_node.get_template_node()
        assert all(n.node is not None for n in nodes), (
            "All epilogue nodes should have an IRNode"
        )
        return cast(
            list[BaseSchedulerNode], [n for n in nodes if n.node is not template_node]
        )

    def _can_fuse_epilogue_impl(
        self,
        cuda_template_buffer: CUDATemplateBuffer,
        existing_epilogue_nodes: list[BaseSchedulerNode],
        node_to_fuse: BaseSchedulerNode,
    ) -> bool:
        """
        Check if the given node can be fused with the epilogue. At the moment, Kernels
        support fusion with Pointwise operations, wrapped in (named) ComputedBuffer nodes.

        Args:
            cuda_template_buffer : A CUDATemplateBuffer object representing the CUDA template and it's result buffer
            existing_epilogue_nodes : List[SchedulerNode]: The list of already fused epilogue nodes.
            node_to_fuse: The SchedulerNode node to be checked if it can be fused with the epilogue.
        Returns:
        - bool: True if the given node can be fused with the epilogue, False otherwise.

        """
        why = WhyNoFuseNames(cuda_template_buffer.get_name(), node_to_fuse.get_name())

        scheduler_nodes_to_fuse = node_to_fuse.get_nodes()

        assert isinstance(cuda_template_buffer, CUDATemplateBuffer)

        # Checks on constituent nodes
        for s_node in scheduler_nodes_to_fuse:
            node = s_node.node

            if not isinstance(node, ComputedBuffer):
                why(f"{node} is not a ComputedBuffer")
                return False
            elif not isinstance(node.data, Pointwise):
                why(f"{node} is not a Pointwise op")
                return False
            elif not node.get_computed_buffer_name():  # type: ignore[attr-defined]
                why(f"{node} does not have a computed buffer name")
                return False

            name = node.get_computed_buffer_name()  # type: ignore[attr-defined]
            # dtype can differ, and strides can differ as long as they are broadcastable
            if node.get_size() != cuda_template_buffer.get_size():
                why(
                    f"{name}'s size: {node.get_size()} differs from {cuda_template_buffer.get_name()}'s \
size: {cuda_template_buffer.get_size()}"
                )
                return False

        assert len(
            existing_epilogue_nodes
        ) or cuda_template_buffer.get_name() in OrderedSet(
            [rd.name for rd in node_to_fuse.read_writes.reads]
        ), "First epilogue node must read from cuda template buffer"

        if node_to_fuse.has_aliasing_or_mutation():
            why(f"{node_to_fuse.get_name()} has aliasing or mutation")
            return False
        elif node_to_fuse.is_reduction():
            why(
                f"{node_to_fuse.get_name()} is a reduction which is not yet supported by EVT"
            )
            return False
        elif (
            not config.cuda.cutlass_epilogue_fusion_enabled
            or not config.epilogue_fusion
        ):
            why("cutlass epilogue fusion is not enabled")
            return False
        elif not cuda_template_buffer.supports_epilogue_fusion:
            why("epilogue fusion is only supported for TMA-enabled gemm ops")
            return False

        # Final check: trial-run the EVT python codegen; any NotImplementedError
        # means the epilogue uses an op/dtype the EVT path cannot express.
        try:
            from torch._inductor.codegen.cuda.cutlass_python_evt import (
                CutlassEVTCodegen,
            )

            CutlassEVTCodegen.ir_to_evt_python_code(
                cuda_template_buffer.get_name(),
                existing_epilogue_nodes + list(node_to_fuse.get_nodes()),
                OrderedSet(),
            )

        except NotImplementedError as e:
            not_implemented_op = str(e)
            if not_implemented_op.startswith("_op_"):
                not_implemented_op = not_implemented_op[4:]
                why(
                    f"Cannot fuse epilogue node {node_to_fuse} into {cuda_template_buffer.name}, \
likely due to unsupported operation: {not_implemented_op}"  # noqa: G004, B950
                )
                return False
            else:  # Likely due to unsupported dtype.
                why(
                    f"Cannot fuse epilogue node {node_to_fuse} into {cuda_template_buffer.name}. \
Reason: {not_implemented_op}"  # noqa: G004, B950
                )
                return False

        return True
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/cuda/cuda_env.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ import logging
3
+ import shutil
4
+ from typing import Optional
5
+
6
+ import torch
7
+ from torch._inductor.utils import clear_on_fresh_cache
8
+
9
+ from ... import config
10
+
11
+
12
+ log = logging.getLogger(__name__)
13
+
14
+
15
@clear_on_fresh_cache
@functools.lru_cache(1)
def get_cuda_arch() -> Optional[str]:
    """Return the CUDA arch as a string like "90", or None on failure.

    Uses config.cuda.arch when set; otherwise falls back to the compute
    capability of the first visible device.
    """
    try:
        configured = config.cuda.arch
        if configured is not None:
            return str(configured)
        major, minor = torch.cuda.get_device_capability(0)
        return str(major * 10 + minor)
    except Exception as e:
        log.error("Error getting cuda arch: %s", e)
        return None
28
+
29
+
30
@clear_on_fresh_cache
@functools.lru_cache(1)
def get_cuda_version() -> Optional[str]:
    """Return the CUDA toolkit version string, or None on failure.

    Uses config.cuda.version when set; otherwise torch.version.cuda.
    """
    try:
        version = config.cuda.version
        return torch.version.cuda if version is None else version
    except Exception as e:
        log.error("Error getting cuda version: %s", e)
        return None
41
+
42
+
43
@functools.cache
def nvcc_exist(nvcc_path: Optional[str] = "nvcc") -> bool:
    """Return True iff `nvcc_path` names an executable findable on PATH."""
    if nvcc_path is None:
        return False
    return shutil.which(nvcc_path) is not None
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/cuda/cuda_kernel.py ADDED
@@ -0,0 +1,674 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import functools
3
+ import itertools
4
+ import logging
5
+ from collections import defaultdict
6
+ from dataclasses import dataclass
7
+ from typing import Any, Callable, Literal, Optional, TYPE_CHECKING, Union
8
+
9
+ from sympy import Expr, symbols
10
+
11
+ import torch._inductor.config as config
12
+ from torch import dtype as torch_dtype
13
+ from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu
14
+ from torch._inductor.scheduler import BaseSchedulerNode
15
+ from torch._inductor.utils import do_bench_using_profiling, OrderedSet, Placeholder
16
+ from torch.utils._sympy.value_ranges import ValueRanges
17
+
18
+ from .cutlass_utils import DTYPE_TO_CUTLASS_TYPE
19
+
20
+
21
+ if TYPE_CHECKING:
22
+ from .cuda_template import ArgInfo
23
+
24
+ from ...autotune_process import CUDABenchmarkRequest
25
+ from ...ir import (
26
+ Buffer,
27
+ ChoiceCaller,
28
+ CUDATemplateBuffer,
29
+ IRNode,
30
+ Layout,
31
+ PrimitiveInfoType,
32
+ TensorBox,
33
+ )
34
+ from ...utils import sympy_product
35
+ from ...virtualized import V
36
+ from ..common import (
37
+ CSEVariable,
38
+ IndentedBuffer,
39
+ Kernel,
40
+ OpOverrides,
41
+ WorkspaceArg,
42
+ WorkspaceZeroMode,
43
+ )
44
+ from ..cpp_utils import CppPrinter, DTYPE_TO_CPP
45
+
46
+
47
+ if TYPE_CHECKING:
48
+ from torch._inductor.codegen.cuda.cuda_template import CUDATemplate
49
+
50
+ log = logging.getLogger(__name__)
51
+
52
+ cexpr = CppPrinter().doprint
53
+
54
+
55
+ def _normalize_idx(index: int, total_length: int) -> int:
56
+ return index if index >= 0 else index + total_length
57
+
58
+
59
# Symbols a GEMM layout may expose: the problem sizes (M, N, K), the batch
# dimension (B), and the leading dimensions of the A/B/C/D operands.
ValidLayoutSymbols = Literal["M", "N", "K", "B", "lda", "ldb", "ldc", "ldd"]
# Which attribute of an IRNode a layout symbol may be bound to.
ValidLayoutAttrs = Literal["size", "stride"]
61
+
62
+
63
@dataclass(frozen=True)
class LayoutArg:
    """
    Binds a layout symbol (e.g. "M" or "lda") to one dimension of an
    IRNode's size or stride.
    """

    node: IRNode
    symbol: ValidLayoutSymbols
    attr: ValidLayoutAttrs
    dim: int

    def matches(self, node, attr, dim) -> bool:
        """Return True iff this arg refers to the given (node, attr, dim)."""
        return all(
            (
                self.node == node,
                self.attr == attr,
                self.dim == dim,
            )
        )
72
+
73
+
74
class CUDAKernel(Kernel):
    """
    Baseclass for CUDA / Cutlass based Kernels
    """

    overrides = OpOverrides  # type: ignore[assignment]

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # Layout symbol (e.g. "M", "lda") -> list of LayoutArgs registered
        # under that symbol.
        self.layout_args: dict[str, list[LayoutArg]] = defaultdict(list)
        # Extra symbolic sizes (beyond the fixed GEMM layout args) that are
        # passed to the kernel as additional integer arguments.
        self.size_args: list[Union[Expr, int]] = []
        # Mapping from arg name to IRNode.
        self.named_nodes: dict[str, IRNode] = {}

    def find_symbol(
        self, node: IRNode, attr: ValidLayoutAttrs, dim: int
    ) -> Optional[str]:
        """Return the layout symbol bound to (node, attr, dim), or None."""
        arg = self.find_layout_arg(node, attr, dim)
        return arg.symbol if arg else None

    def find_layout_arg(
        self, node: IRNode, attr: ValidLayoutAttrs, dim: int
    ) -> Optional[LayoutArg]:
        """
        Return a LayoutArg registered for (node, attr, dim), or None.

        Several symbols may be registered against the same triple (e.g. "K"
        is registered on both X and W); the first match is returned.
        """
        matches = [
            arg
            for arg in itertools.chain.from_iterable(self.layout_args.values())
            if arg.matches(node, attr, dim)
        ]
        if len(matches) >= 1:
            # Verify all matches have the same node, attribute, and dimension
            # And if they come from the same node, whichever symbol we use is fine.
            # if in runtime the logic changes, this would trigger guard
            first_match = matches[0]
            if not all(
                match.node == first_match.node
                and match.attr == first_match.attr
                and match.dim == first_match.dim
                for match in matches
            ):
                raise AssertionError("All matching layout args should be identical")
            return first_match
        return None

    def add_layout_arg(
        self, symbol: ValidLayoutSymbols, node: IRNode, attr: ValidLayoutAttrs, dim: int
    ):
        """Register ``symbol`` as naming dimension ``dim`` of ``node``'s ``attr``."""
        arg = LayoutArg(node, symbol, attr, dim)
        self.layout_args[symbol].append(arg)

    def init_layout_args(self) -> None:
        """
        Register the canonical GEMM layout symbols (M, N, K, B and the
        leading dimensions) against the X / W / Y nodes and the optional
        Bias node. Expects ``self.named_nodes`` to already contain "X",
        "W" and "Y".
        """
        X = self.named_nodes["X"]
        W = self.named_nodes["W"]
        Y = self.named_nodes["Y"]
        Bias = self.named_nodes.get("Bias", None)
        x_mdim = _normalize_idx(-2, len(X.get_size()))
        x_kdim = _normalize_idx(-1, len(X.get_size()))
        w_kdim = _normalize_idx(-2, len(W.get_size()))
        w_ndim = _normalize_idx(-1, len(W.get_size()))
        y_mdim = _normalize_idx(-2, len(Y.get_size()))
        y_ndim = _normalize_idx(-1, len(Y.get_size()))
        self.add_layout_arg("M", X, "size", x_mdim)
        self.add_layout_arg("K", X, "size", x_kdim)
        self.add_layout_arg("K", W, "size", w_kdim)
        self.add_layout_arg("N", W, "size", w_ndim)
        self.add_layout_arg("M", Y, "size", y_mdim)
        self.add_layout_arg("N", Y, "size", y_ndim)
        # A rank > 2 input is treated as batched, with the batch in dim 0.
        if len(X.get_size()) > 2:
            self.add_layout_arg("B", X, "size", 0)

        lda_dim = self.find_ld_idx(X)
        ldb_dim = self.find_ld_idx(W)
        ldc_dim = self.find_ld_idx(Bias) if Bias else None
        ldd_dim = self.find_ld_idx(Y)
        self.add_layout_arg("lda", X, "stride", lda_dim)
        self.add_layout_arg("ldb", W, "stride", ldb_dim)
        if Bias is not None and ldc_dim is not None:
            self.add_layout_arg("ldc", Bias, "stride", ldc_dim)
        self.add_layout_arg("ldd", Y, "stride", ldd_dim)

    def get_layout_args(self) -> tuple[Union[Expr, int], ...]:
        """
        Return the concrete values (sizes / strides, possibly symbolic) for
        the layout symbols, in the fixed order
        (M, N, K, B, LDA, LDB, LDC, LDD). LDC is 0 when there is no Bias.
        """
        X = self.named_nodes["X"]
        W = self.named_nodes["W"]
        Y = self.named_nodes["Y"]
        Bias = self.named_nodes.get("Bias", None)
        mdim = _normalize_idx(-2, len(X.get_size()))
        ndim = _normalize_idx(-1, len(W.get_size()))
        kdim = _normalize_idx(-1, len(X.get_size()))

        def get_ld(node) -> Union[Expr, int]:
            # Leading dimension = stride of the non-contiguous matrix dim.
            dim = self.find_ld_idx(node)
            return node.get_stride()[dim]

        M = X.get_size()[mdim]
        N = W.get_size()[ndim]
        K = X.get_size()[kdim]
        B = X.get_size()[0] if len(X.get_size()) > 2 else 1
        LDA = get_ld(X)
        LDB = get_ld(W)
        LDC = get_ld(Bias) if Bias else 0
        LDD = get_ld(Y)
        return (M, N, K, B, LDA, LDB, LDC, LDD)

    def get_dynamic_shape_args(self) -> list[Union[Expr, int]]:
        """All integer args passed to the kernel: layout args then extra sizes."""
        return [*self.get_layout_args(), *self.size_args]

    @staticmethod
    def find_ld_idx(node: IRNode) -> int:
        """
        Return the dimension index whose stride is the leading dimension of
        ``node``: dim -2 when the innermost dim is contiguous (row-major),
        otherwise dim -1 (then stride[-2] must be 1).
        """
        strides = node.get_stride()
        # Innermost dim contiguous -> leading dimension is stride of dim -2.
        # NOTE(review): for a 1D node this returns _normalize_idx(-2, 1) == -1,
        # i.e. the last (only) dim — confirm that is the intended behavior.
        if V.graph.sizevars.statically_known_equals(strides[-1], 1):
            return _normalize_idx(-2, len(strides))

        assert V.graph.sizevars.statically_known_equals(strides[-2], 1), strides[-2]
        return _normalize_idx(-1, len(strides))
188
+
189
+
190
class CUDATemplateKernel(CUDAKernel):
    """
    Template kernels defined by CUDA / Cutlass in C++.
    """

    # Trailing C++ parameters appended to every generated kernel signature:
    # an out-param used for workspace-size queries, the workspace buffer
    # itself, and the CUDA stream to launch on.
    _EXTRA_CPP_ARGS = "size_t* workspace_size, uint8_t* workspace, cudaStream_t stream"

    def __init__(
        self,
        kernel_name: str,
        runtime_arg_info: list["ArgInfo"],
        runtime_arg_values: list[Any],
    ) -> None:
        """
        Initializes a new instance of the CUDATemplateKernel class.

        Args:
            kernel_name (str): The name of the kernel.
            runtime_arg_info: Name/type descriptions of extra runtime args
                that follow the size args in the kernel signature.
            runtime_arg_values: Concrete values for those runtime args, in
                the same order as ``runtime_arg_info``.
        """
        super().__init__()
        self.kernel_name = kernel_name
        self.runtime_arg_info = runtime_arg_info
        self.runtime_arg_values = runtime_arg_values

    def check_not_null(self, node: IRNode) -> str:
        """
        Generates code to check that a node is not null.

        The emitted C++ throws a runtime_error when the argument pointer is
        null but the node's element count is non-zero. Returns "" for a None
        node or one without a registered arg name.
        """
        if node is None:
            return ""

        size_str = self.size(node, 0, -1)
        name_str = self.arg_name(node)
        if name_str is None:
            return ""

        res = IndentedBuffer(initial_indent=2)
        res.tabwidth = 1
        res.splice(
            f"""
            {{
              if (!{name_str}) {{
                int64_t {name_str}_size = {size_str};
                if ({name_str}_size > 0) {{
                  throw std::runtime_error("input {name_str} is null but size is not 0!");
                }}
              }}
            }}
            """
        )
        return res.getvalue()

    def get_signature(self) -> str:
        # ``self.signature`` is produced as a side effect of ``def_kernel``.
        return self.signature

    def def_kernel(
        self,
        inputs: list[IRNode],
        outputs: list[IRNode],
        names_str: str = "",
        input_reorder: Optional[list[int]] = None,
    ) -> str:
        """
        Hook called from template code to generate function definition and
        needed args.

        Args:
            inputs: List of input IRNodes
            outputs: List of output IRNodes
            names_str: Comma separated list of input + output argument names.
            input_reorder: The actual order of input nodes.
                e.g. The template might have input argument defined as [X, W, Bias],
                and the actual input passed into this template could be [Bias, X, W].
                In this case, the `input_reorder` would be [2, 0, 1].
            additional_size_args: Additional size arguments for epilogue inputs
        """
        names = [x.strip() for x in names_str.strip().split(",")]
        if len(inputs) + len(outputs) != len(names):
            raise RuntimeError(
                f"{len(inputs) + len(outputs)=} != {len(names)=}, {inputs=}, {outputs=}, {names=}"
            )

        if input_reorder is not None:
            assert len(inputs) == len(input_reorder)
        else:
            input_reorder = list(range(len(inputs)))

        # Register each input buffer under its template-facing name.
        for idx in input_reorder:
            name = names[idx]
            node = inputs[idx]
            if node is not None:
                self.named_nodes[name] = node
                self.args.input_buffers[node.get_name()] = name

        # Collect free symbols from the shapes/strides of outputs other than
        # the canonical X/W/Bias/Y operands; those symbols become extra int
        # kernel arguments.
        free_symbols: OrderedSet[Expr] = OrderedSet()
        for name, node in zip(names[len(inputs) : len(inputs) + len(outputs)], outputs):
            if node is not None:
                self.named_nodes[name] = node
                self.args.output_buffers[node.get_name()] = name

                if name not in (
                    "X",
                    "W",
                    "Bias",
                    "Y",
                ):  # we handle these symbolic shapes explicitly
                    for expr in itertools.chain(node.get_size(), node.get_stride()):
                        if isinstance(expr, Expr):
                            for s in expr.free_symbols:
                                free_symbols.add(s)  # type: ignore[arg-type]

        arg_defs, *_ = self.args.cpp_argdefs(DTYPE_TO_CUTLASS_TYPE)

        self.init_layout_args()
        # Fixed GEMM layout symbols first, then the extra free symbols; all
        # are declared as plain `const int` parameters.
        size_vars = ["M", "N", "K", "B", "lda", "ldb", "ldc", "ldd"]
        size_vars.extend(str(s) for s in free_symbols)
        self.size_args.extend(free_symbols)
        size_args = [f"const int {s}" for s in size_vars]

        runtime_arg_decls = ",".join(
            [f"{arg.ty} {arg.name}" for arg in self.runtime_arg_info]
        )
        if runtime_arg_decls:
            runtime_arg_decls += ", "

        signature = f"int {self.kernel_name}({', '.join(arg_defs + size_args)}, {runtime_arg_decls}{self._EXTRA_CPP_ARGS})"
        self.signature = signature
        return signature

    def call_kernel(
        self,
        name: str,
        node: "CUDATemplateBuffer",  # type: ignore[name-defined]
    ) -> None:
        """
        Generates code to call the kernel through V.graph.wrapper_code.
        used from within torch._inductor.wrapper.PythonWrapperCodegen

        name: Name of kernel function.
        node: The CUDATemplateBuffer node which contains information about the kernel, it's fused epilogue nodes
        as well as all required inputs and outputs.
        """
        wrapper = V.graph.wrapper_code

        arg_types: list[Any]
        if V.graph.cpp_wrapper:
            # Make sure we initialize these kernels since they're exported as
            # C-style symbol names.
            assert isinstance(wrapper, CppWrapperCpu)
            wrapper.initialized_kernels[name] = self
            # We always originally initialize name with "KERNEL_NAME". So, we
            # we replace with the real kernel name passed as an arg to this function.
            self.signature = self.signature.replace(str(Placeholder.KERNEL_NAME), name)
            _, call_args, arg_types = self.args.cpp_argdefs(DTYPE_TO_CUTLASS_TYPE)
        else:
            _, call_args, _, arg_types = self.args.python_argdefs()

        # Append the integer size args and the template runtime args after the
        # buffer args, keeping call_args and arg_types in lockstep.
        dynamic_shape_args = self.get_dynamic_shape_args()
        call_args.extend(dynamic_shape_args)  # type: ignore[arg-type]
        for arg in self.runtime_arg_values:
            call_args.append(arg)
        arg_types.extend("int" for _ in dynamic_shape_args)
        for arg in self.runtime_arg_info:
            arg_types.append(arg.ty)
        # dynamo wraps unspec variable as 0d CPU tensor, need convert to scalar
        for i in range(len(call_args)):
            if V.graph.is_unspec_arg(call_args[i]):
                call_args[i] = call_args[i] + ".item()"
            elif isinstance(arg_types[i], torch_dtype):
                call_args[i] = (
                    call_args[i]
                    if V.graph.cpp_wrapper
                    else f"c_void_p({call_args[i]}.data_ptr())"
                )

        # workspace_size ptr is NULL to mark this call is not intended for retrieving workspace_size.
        # workspace_size should have already been retrieved prior to this call.
        # workspace_size is here.
        call_args.append("nullptr" if V.graph.cpp_wrapper else "None")
        if V.graph.cpp_wrapper:
            arg_types.append("size_t*")

        if node.get_workspace_size() > 0:
            # Allocate a scratch workspace buffer for the kernel and pass its
            # pointer; it is deallocated again after the kernel call below.
            ws = WorkspaceArg(
                count=node.get_workspace_size(),
                device=V.graph.get_current_device_or_throw(),
                zero_mode=WorkspaceZeroMode.UNINITIALIZED,
                outer_name=WorkspaceArg.unique_name(),
            )
            wrapper.generate_workspace_allocation(ws)
            workspace = str(ws.outer_name)
            call_args.append(
                workspace
                if V.graph.cpp_wrapper
                else f"c_void_p({workspace}.data_ptr())"
            )
        else:
            ws = None
            call_args.append("nullptr" if V.graph.cpp_wrapper else "None")
        if V.graph.cpp_wrapper:
            arg_types.append("uint8_t*")

        wrapper.generate_kernel_call(
            name,
            call_args,
            triton=False,
            arg_types=arg_types,
        )
        if ws:
            wrapper.generate_workspace_deallocation(ws)

    def dtype(self, node: IRNode) -> Optional[str]:
        """
        Generates code which represents dtype of a given node.
        Returns "void" for a None node.
        """

        if node is None:
            return "void"
        return DTYPE_TO_CPP.get(node.get_layout().dtype)

    def cutlass_dtype(self, node: IRNode, default_dtype="void") -> Optional[str]:
        # Helper method, called into from CUTLASSGemmTemplate.
        # Maps the node's torch dtype to its CUTLASS C++ type name.
        if node is None:
            return default_dtype
        from torch._inductor.codegen.cuda.cuda_template import CUTLASSTemplate

        return CUTLASSTemplate._DTYPE_TO_CUTLASS[node.get_layout().dtype]

    def max_valid_index(self, node: IRNode, default=-1):
        # Helper method, called into from CUTLASSGemmTemplate.
        # Largest flat offset addressable in the node's layout:
        # sum over dims of (size - 1) * stride.
        if node is None:
            return default
        max_valid_offset = 0
        for i in range(len(node.get_size())):
            max_valid_offset += (node.get_size()[i] - 1) * node.get_stride()[i]
        return max_valid_offset

    def offset(self, node: IRNode) -> str:
        """
        Generates code which represents offset of a given node.
        Returns "0" for a None node.
        """

        if node is None:
            return "0"
        return str(node.get_layout().offset)  # type: ignore[union-attr]

    def ptr(self, node: IRNode) -> str:
        """
        Generates code which represents pointer of a given node.
        Emits "arg" or "arg + offset"; "nullptr" when the node is None or
        has no registered arg name.
        """

        if node is None:
            return "nullptr"
        arg_name = self.arg_name(node)
        if arg_name is None:
            return "nullptr"
        offset = self.offset(node)
        return arg_name if offset == "0" else f"{arg_name} + {offset}"

    def size(
        self,
        node: IRNode,
        start_index: int,
        end_index: Optional[int] = None,
        default_value: int = 0,
    ) -> str:
        """
        Hook called from template code to get the size of an arg.
        Generates code which represents size of a given node in [start_index, end_index).
        If node is None, returns default_value.

        TODO: Will add needed args to pass it in if it is dynamic.
        """

        if node is None:
            return str(default_value)

        start_index = _normalize_idx(start_index, len(node.get_size()))
        if end_index is None:
            end_index = start_index
        end_index = _normalize_idx(end_index, len(node.get_size()))
        # Prefer registered layout symbols (so dynamic sizes reference the
        # kernel's int args); fall back to the node's literal sizes.
        sizes = [
            self.find_symbol(node, "size", dim=i) or node.get_size()[i]
            for i in range(start_index, end_index + 1)
        ]
        if len(sizes) == 0:
            return str(default_value)

        sizes = [symbols(v) if isinstance(v, str) else v for v in sizes]
        # NOTE(review): despite the ``-> str`` annotation this returns a sympy
        # expression; callers (e.g. check_not_null) interpolate it into
        # strings, which stringifies it implicitly.
        val = sympy_product(sizes)
        return val

    def stride(self, node: IRNode, index: int, default_value: int = 0) -> str:
        """
        Hook called from template code to get the stride of an arg.
        Generates code which represents stride of a given node at index.
        If node is None, returns default_value.

        TODO: Will add needed args to pass it in if it is dynamic.
        """

        if node is None:
            return str(default_value)

        index = _normalize_idx(index, len(node.get_size()))
        if index < 0:
            return str(default_value)

        stride = node.get_stride()[index]
        # Strides statically known to be 0 or 1 are emitted as literals;
        # otherwise prefer the registered stride symbol.
        if V.graph.sizevars.statically_known_leq(stride, 1):
            return str(stride)
        return self.find_symbol(node, "stride", dim=index) or str(stride)

    def batch_stride(self, node: IRNode, default_value: int = 0) -> str:
        """
        Hook called from template code to get the batch stride of an arg.
        Returns 0 if batch dim is not present.

        This method assumes that batch stride is the largest stride.
        """

        if node is None:
            return str(default_value)

        # Fewer than 3 dims -> no batch dimension.
        if len(node.get_size()) < 3:
            return str(default_value)

        batch_stride = node.get_stride()[0]
        if V.graph.sizevars.statically_known_leq(batch_stride, 1):
            return str(batch_stride)

        # Batch stride expressed as rows * cols (size[1] * size[2]), using
        # registered size symbols where available.
        return "{}*{}".format(
            self.find_symbol(node, "size", dim=1) or node.get_size()[1],
            self.find_symbol(node, "size", dim=2) or node.get_size()[2],
        )

    def row_or_column_stride(self, node: IRNode, default_value: int = 0) -> str:
        """
        Hook called from template code to get the row or column stride of an arg.
        This is required by some CUTLASS 2.X APIs.
        If the node is in row_major, it returns stride[-2].
        If the node is in column_major, it returns stride[-1].

        TODO: Will add needed args to pass it in if it is dynamic.
        """

        if node is None or len(node.get_stride()) < 2:
            return str(default_value)

        stride0 = node.get_stride()[-1]
        stride1 = node.get_stride()[-2]
        if stride0 == 1:
            return cexpr(self.rename_indexing(stride1))
        elif stride1 == 1:
            return cexpr(self.rename_indexing(stride0))
        else:
            raise RuntimeError(
                f"At least 1 stride should be 1. Strides: {node.get_stride()=}"
            )

    def load(self, name: str, index: Expr, mode: Any = None) -> CSEVariable:
        """
        Mock load function for memory planning to optimize allocations properly.
        """
        return self.create_cse_var(name, bounds=ValueRanges.unknown())

    def store(self, name: str, index: Expr, value: Any, mode: Any = None) -> None:
        """
        Mock store function for memory planning to optimize allocations properly.
        """
        self.store_buffer_names.add(name)
561
+
562
+
563
class CUDATemplateCaller(ChoiceCaller):
    """
    CUDATemplateCaller

    This class represents a caller for CUDA template kernels. It is a subclass of ChoiceCaller.
    Attributes:
        name (str): The name of the caller.
        category (str): The category of the caller.
        bmreq (CUDABenchmarkRequest): The benchmark request for the caller.
        template_buffer (CUDATemplateBuffer): The template buffer for the caller.
    """

    def __init__(
        self,
        name: str,
        category: str,
        input_nodes: list[Buffer],
        layout: Layout,
        make_kernel_render: Callable[
            [CUDATemplateBuffer, Optional[list[BaseSchedulerNode]]],
            tuple[CUDATemplateKernel, functools.partial[str]],
        ],
        bmreq: CUDABenchmarkRequest,
        supports_epilogue_fusion: bool,
        template: "CUDATemplate",  # type: ignore[name-defined]
        info_kwargs: Optional[
            dict[str, Union[PrimitiveInfoType, list[PrimitiveInfoType]]]
        ],  # type: ignore[type-arg]
        description: str,
    ) -> None:
        super().__init__(name, input_nodes, layout, description)
        self.category = category
        self.make_kernel_render = make_kernel_render
        self.bmreq = bmreq
        self.supports_epilogue_fusion = supports_epilogue_fusion
        self.template = template
        # Extra op metadata (e.g. the CUTLASS "op" and "swizzle") used by
        # info_dict() / hash_key(); may be None.
        self.info_kwargs = info_kwargs

    def precompile(self) -> None:
        """Compile the benchmark request's kernel ahead of benchmarking."""
        assert self.bmreq is not None
        self.bmreq.precompile()

    def benchmark(self, *args, out) -> float:
        """
        Benchmark this choice and return its runtime. Uses the
        profiler-based path when the corresponding config flag is set.
        """
        assert self.bmreq is not None
        if config.profile_bandwidth_with_do_bench_using_profiling:
            algo = self.bmreq.make_run_fn(*args, out=out)
            return do_bench_using_profiling(algo)
        return self.bmreq.benchmark(*args, out=out)

    def __str__(self) -> str:
        return f"CUDATemplateCaller(source_file={self.bmreq.source_file})"

    def call_name(self) -> str:
        return f"cuda_template_kernels.{self.name}"

    def kernel_hash_key(self) -> str:
        """
        Return kernel hash key that does not depend on swizzle.
        """
        return "-".join(
            [
                self.category,
                self.bmreq.hash_key,
            ]
        )

    def hash_key(self) -> str:
        """
        Return the full hash key; unlike kernel_hash_key(), this one also
        includes the swizzle value.
        """
        return "-".join(
            [
                self.category,
                self.bmreq.hash_key,
                str(self.info_dict().get("swizzle")),
            ]
        )

    def info_dict(self) -> dict[str, Union[PrimitiveInfoType, list[PrimitiveInfoType]]]:
        """Information returned here is logged to the autotune log file when that is enabled."""
        if self.info_kwargs is not None and "op" in self.info_kwargs:
            op: Any = self.info_kwargs["op"]
            return {
                "backend": "CUDA",
                "op_type": type(op).__name__,
                "op_conf_name": str(op.configuration_name()),
                "op_arch": str(op.arch),
                "tile_shape": str(op.tile_description.tile_shape),
                "epilogue_schedule": str(op.epilogue_schedule),
                "kernel_schedule": str(op.kernel_schedule),
                "element_accumulator": str(op.accumulator_type()),
                "op_name": str(op.procedural_name()),
                "instruction_shape": str(
                    op.tile_description.math_instruction.instruction_shape
                ),
                "swizzle": str(self.info_kwargs["swizzle"]),
            }
        else:
            # No op metadata available; emit a minimal record.
            return {"backend": "CUDA", "op_type": "unknown"}

    def output_node(self) -> TensorBox:
        """Materialize this choice as a CUDATemplateBuffer wrapped in a TensorBox."""
        # Refresh the workspace size before the buffer captures it.
        self.bmreq.update_workspace_size()
        return TensorBox.create(
            CUDATemplateBuffer(
                layout=self.layout,
                inputs=self.input_nodes,
                make_kernel_render=self.make_kernel_render,
                workspace_size=self.bmreq.workspace_size,
                supports_epilogue_fusion=self.supports_epilogue_fusion,
                template=self.template,
            )
        )
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/cuda/cuda_template.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import functools
3
+ import hashlib
4
+ import itertools
5
+ from dataclasses import dataclass
6
+ from typing import Any, Optional, TYPE_CHECKING
7
+ from typing_extensions import override
8
+ from unittest.mock import patch
9
+
10
+ import sympy
11
+
12
+ import torch
13
+ from torch._inductor.utils import Placeholder
14
+ from torch._logging import getArtifactLogger
15
+
16
+ from ...autotune_process import CUDABenchmarkRequest, TensorMeta
17
+ from ...ir import Buffer, CUDATemplateBuffer, IRNode, Layout
18
+ from ...utils import IndentedBuffer, unique
19
+ from ...virtualized import V
20
+ from ..common import KernelTemplate
21
+ from .cuda_kernel import CUDATemplateCaller, CUDATemplateKernel
22
+ from .cutlass_utils import DTYPE_TO_CUTLASS_TYPE
23
+
24
+
25
+ if TYPE_CHECKING:
26
+ from ...scheduler import BaseSchedulerNode # noqa: TC004
27
+ else:
28
+ BaseSchedulerNode = Any
29
+
30
+ GemmOperation = Any
31
+
32
+ autotuning_log = getArtifactLogger(__name__, "autotuning")
33
+
34
+
35
@dataclass(frozen=True)
class ArgInfo:
    """
    Describes one runtime kernel argument, e.g.
    ArgInfo("swizzle", "const uint8_t").
    """

    # Argument name as it appears in the generated kernel signature.
    name: str
    # C++ type string for the argument declaration.
    ty: str
39
+
40
+
41
class CUDATemplate(KernelTemplate):
    # Monotonic counter shared by all CUDATemplate instances.
    index_counter = itertools.count()

    def __init__(
        self,
        name: str,
        input_nodes: list[Buffer],
        layout: Layout,
        input_reorder: Optional[list[int]] = None,
    ) -> None:
        """

        Baseclass for CUDA C++ Templates, derived from KernelTemplate. Not to be instantiated directly.

        Args:
            name (str): The name of the CUDATemplate object.
            input_nodes (List[IRNode]): A list of input IRNodes.
            layout (Layout): The layout of the output buffer / tensor.
            input_reorder (Optional[List[int]]): An optional list that specifies the order of the input nodes.

        """
        super().__init__(name)
        self.input_nodes = input_nodes
        self.output_node: Buffer = Buffer(name="buf_out", layout=layout)
        self.input_reorder = input_reorder
        self.layout = layout

    @staticmethod
    def supports_epilogue_fusion(op: GemmOperation) -> bool:
        # Conservative default; subclasses may opt in per-op.
        return False

    def generate(  # type: ignore[override]
        self,
        description,
        **kwargs,
    ) -> CUDATemplateCaller:
        """
        Generates the CUDA template caller object for the given GEMM template and operation. This CUDATemplateCaller
        may be used to call and benchmark the generated CUDA kernel in a standalone manner to enable Autotuning.

        Args:
            kwargs: Additional keyword arguments.

        Returns:
            A CUDATemplateCaller object representing the generated CUDA template caller.
        """
        kernel_name = str(Placeholder.KERNEL_NAME)
        # Render the kernel source once, with get_dtype faked to report the
        # (not-yet-registered) output buffer's dtype.
        with (
            patch.object(V.graph, "get_dtype", self._fake_get_dtype(self.output_node)),
            CUDATemplateKernel(
                kernel_name=kernel_name,
                runtime_arg_info=self.get_runtime_arg_info(),
                runtime_arg_values=self.get_runtime_arg_values(**kwargs),
            ) as kernel,
        ):
            code = self.render(kernel=kernel, **kwargs)
            _, call_args, _, _ = kernel.args.python_argdefs()
            autotuning_log.debug("Generated Code:\n%s", code)
            autotuning_log.debug(
                "Args: cpp_argdefs: %s, python_argdefs: %s",
                kernel.args.cpp_argdefs(DTYPE_TO_CUTLASS_TYPE),
                kernel.args.python_argdefs(),
            )

        input_reorder = (
            self.input_reorder
            if self.input_reorder is not None
            else list(range(len(self.input_nodes)))
        )
        # Sanity-check that the rendered kernel's buffer args are exactly the
        # (reordered, de-duplicated) inputs followed by the output.
        expected_args = list(
            unique(self.input_nodes[idx].get_name() for idx in input_reorder)
        )
        expected_args.extend([self.output_node.get_name()])
        assert list(call_args)[: len(expected_args)] == expected_args, (
            call_args,
            expected_args,
        )
        # NOTE(review): return value unused — presumably evaluated for the
        # side effect of materializing size hints/guards; confirm.
        V.graph.sizevars.size_hints(map(sympy.expand, call_args[len(expected_args) :]))
        size_args = V.graph.sizevars.size_hints(kernel.get_dynamic_shape_args())
        extra_args = tuple(list(size_args) + self.get_runtime_arg_values(**kwargs))

        # Give the kernel a content-derived name so distinct renderings don't
        # collide, and rewrite the source to match.
        kernel_hash = hashlib.sha256(code.encode("utf-8")).hexdigest()[:8]
        kernel_name = f"cutlass_{kernel_hash}"
        code = code.replace(self.name, kernel_name)

        # create the BenchmarkRequest
        bmreq = CUDABenchmarkRequest(
            kernel_name=kernel_name,
            input_tensor_meta=TensorMeta.from_irnodes(self.input_nodes),
            output_tensor_meta=TensorMeta.from_irnodes(self.output_node),
            extra_args=extra_args,
            source_code=code,
        )

        # kwargs has "op" argument in case of CUTLASSGemmTemplate
        op = kwargs["op"]
        if not op:
            supports_epilogue_fusion = False
        else:
            # epilogue fusion is only supported for TMA kernels
            supports_epilogue_fusion = self.supports_epilogue_fusion(op)

        def make_kernel_render(
            template_node: CUDATemplateBuffer,
            epilogue_nodes: Optional[list[BaseSchedulerNode]] = None,
        ) -> tuple[CUDATemplateKernel, functools.partial[str]]:
            # Deferred render used at lowering time; builds a fresh kernel
            # object so scheduling can re-render with epilogue nodes fused in.
            assert supports_epilogue_fusion or not epilogue_nodes, (
                "epilogue fusion is not supported for this kernel"
            )
            kernel = CUDATemplateKernel(
                kernel_name=str(Placeholder.KERNEL_NAME),
                runtime_arg_info=self.get_runtime_arg_info(),
                runtime_arg_values=self.get_runtime_arg_values(**kwargs),
            )
            render = functools.partial(
                self.render,
                kernel=kernel,
                template_buffer_node=template_node,
                epilogue_nodes=epilogue_nodes,
                **kwargs,  # includes "op" argument in case of CUTLASSGemmTemplate
            )
            return kernel, render

        return CUDATemplateCaller(
            kernel_name,
            "cutlass_gemm",
            self.input_nodes,
            self.output_node.get_layout(),
            make_kernel_render,
            bmreq,
            supports_epilogue_fusion,
            self,
            kwargs,
            description,
        )

    def header(self) -> IndentedBuffer:
        """Common C++ standard-library includes shared by all CUDA templates."""
        res = IndentedBuffer()
        res.splice(
            """
                #include <exception>
                #include <iostream>
                #include <memory>
                #include <random>
                #include <vector>
            """
        )
        return res

    def globals(self) -> IndentedBuffer:
        """Global C++ definitions (symbol-visibility macro) for the kernel source."""
        res = IndentedBuffer()
        res.splice(
            """
                // We compile all models with -fvisibility=hidden. Any symbols that need to be
                // exposed in the final shared library must be declared with PT_EXPORT to make
                // them visible.
                #ifdef __GNUC__ // Applies to any compiler with GNU extensions (clang and g++)
                #define PT_EXPORT __attribute__((__visibility__("default")))
                #else
                #ifdef _WIN32
                #define PT_EXPORT __declspec(dllexport)
                #else
                #define PT_EXPORT
                #endif
                #endif
            """
        )
        return res

    def render(self, **kwargs) -> str:
        # Subclasses must produce the full kernel source string.
        raise NotImplementedError

    def get_runtime_arg_info(self) -> list[ArgInfo]:
        # No extra runtime args by default; subclasses may override.
        return []

    def get_runtime_arg_values(self, **kwargs) -> list[Any]:
        # Matches get_runtime_arg_info(); empty by default.
        return []
218
+
219
+
220
class CUTLASSTemplate(CUDATemplate):
    """
    CUTLASSTemplate is a class that provides a template for generating CUTLASS Templates. Used as a baseclass for the
    CUTLASSGemmTemplate, providing functionality that might also be relevant for non-GEMM CUTLASS Kernels.
    """

    def header(self) -> IndentedBuffer:
        """Extend the base header with the CUTLASS / CuTe includes."""
        res = super().header()
        res.splice(
            """
                #include "cute/tensor.hpp"
                #include "cutlass/cutlass.h"
                #include "cutlass/numeric_types.h"
                #include "cutlass/tensor_ref.h"
                #include "cutlass/util/host_tensor.h"
                #include "cutlass/util/reference/host/tensor_fill.h"
                #include "cutlass/util/reference/device/tensor_fill.h"
                #include "cutlass/util/device_memory.h"
            """
        )
        return res

    def globals(self) -> IndentedBuffer:
        """Extend the base globals with the CUTLASS_CHECK macro and helpers."""
        res = super().globals()
        res.splice(
            """
                using namespace cute;
                #define CUTLASS_CHECK(status)                                                      \\
                {                                                                                  \\
                  cutlass::Status error = status;                                                  \\
                  if (error != cutlass::Status::kSuccess) {                                        \\
                    auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " +             \\
                        cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__);        \\
                    throw std::runtime_error(msg);                                                 \\
                  }                                                                                \\
                }

                // Used as pass-through functor in EVT just for type casting / rounding
                template <typename T>
                struct identity_op {
                  CUTLASS_HOST_DEVICE
                  T operator()(T val) const { return val; }
                };

            """
        )
        return res

    def cute_int(self, int_str: str, var_name: str) -> str:
        """
        Render an integer for CuTe: compile-time Int<1> when the value is
        statically 1, otherwise the runtime expression, annotated with the
        variable name as a comment.
        """
        res = ""
        if int_str in ("1", "1L"):
            res = "cute::Int<1>{}"
        else:
            res = int_str

        return f"{res} /* {var_name} */"

    # torch dtype -> CUTLASS C++ element type name.
    _DTYPE_TO_CUTLASS = {
        torch.float32: "float",
        torch.float64: "double",
        torch.float16: "cutlass::half_t",
        torch.int32: "int32_t",
        torch.int16: "int16_t",
        torch.int8: "int8_t",
        torch.uint8: "uint8_t",
        torch.bool: "bool",
        torch.bfloat16: "cutlass::bfloat16_t",
        torch.float8_e4m3fn: "cutlass::float_e4m3_t",
    }

    # torch dtype -> C++ type used for CUTLASS sparse metadata tensors.
    _DTYPE_TO_CUTLASS_SPARSE_META = {
        torch.int32: "uint32_t",
        torch.int16: "uint16_t",
    }

    def cutlass_type_cast(self, node: IRNode, ptr: str) -> str:
        """Cast ``ptr`` to a pointer of the node's CUTLASS element type."""
        if node is None:
            return ptr
        else:
            return f"({self._DTYPE_TO_CUTLASS.get(node.get_dtype())}*)({ptr})"

    def cutlass_sparse_meta_type_cast(self, node: IRNode, ptr: str) -> str:
        """Cast ``ptr`` to a pointer of the node's sparse-metadata type."""
        if node is None:
            return ptr
        else:
            return (
                f"({self._DTYPE_TO_CUTLASS_SPARSE_META.get(node.get_dtype())}*)({ptr})"
            )

    @override
    def get_runtime_arg_info(self) -> list[ArgInfo]:
        # CUTLASS kernels take a single extra runtime arg: the swizzle.
        return [ArgInfo("swizzle", "const uint8_t")]

    @override
    def get_runtime_arg_values(self, **kwargs) -> list[Any]:
        """
        Helper method to retrieve runtime args from generate kwargs
        """
        return [kwargs[arg.name] for arg in self.get_runtime_arg_info()]
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/cuda/cutlass_cache.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import functools
3
+ import hashlib
4
+ import json
5
+ import logging
6
+ import os
7
+ import time
8
+ from typing import Any, Optional
9
+
10
+ import torch._inductor.config as config
11
+ from torch._inductor.codecache import cutlass_key
12
+ from torch._inductor.codegen.cuda.cuda_env import get_cuda_arch, get_cuda_version
13
+ from torch._inductor.codegen.cuda.serialization import get_cutlass_operation_serializer
14
+ from torch._inductor.runtime.cache_dir_utils import cache_dir
15
+ from torch._inductor.utils import clear_on_fresh_cache
16
+
17
+
18
+ log = logging.getLogger(__name__)
19
+
20
+
21
+ CONFIG_PREFIX: str = "configs"
22
+
23
+
24
def get_config_request_key(
    arch: str,
    cuda_version: str,
    instantiation_level: str,
) -> str:
    """
    Return a key for the full ops, based on cutlass key, arch, cuda version, and instantiation level.
    """
    combined = "-".join(
        [cutlass_key().hex(), arch, cuda_version, instantiation_level]
    )
    digest = hashlib.sha256(combined.encode("utf-8"))
    # Only the first 8 hex chars are needed to disambiguate config files.
    return digest.hexdigest()[:8]
41
+
42
+
43
def _generate_config_filename(request_key: str) -> str:
    """Build the on-disk JSON filename for a serialized full-ops config blob."""
    return "{}_{}.json".format(CONFIG_PREFIX, request_key)
48
+
49
+
50
@clear_on_fresh_cache
@functools.cache
def maybe_fetch_ops() -> Optional[list[Any]]:
    """
    Fetch ops from databases.

    Looks up a pre-generated list of serialized CUTLASS operations, first in
    the local on-disk cache and then (in fbcode builds only) via the remote
    cache. Returns the deserialized ops, or None when caching is force-disabled
    or no cached entry is available. The result is memoized per process by
    functools.cache and reset on fresh-cache events via clear_on_fresh_cache.
    """
    if config.force_disable_caches:
        return None

    # setup
    arch: str = get_cuda_arch()
    # get_cuda_version might return "12.4.0" or "12.4"
    # but we want to use "12.4"
    version: str = ".".join(get_cuda_version().split(".")[:2])
    instantiation_level: str = config.cuda.cutlass_instantiation_level

    # filename and filepath
    request_key: str = get_config_request_key(arch, version, instantiation_level)
    filename: str = _generate_config_filename(request_key)
    filepath: str = os.path.join(cache_dir(), filename)

    # try fetch
    serialized_ops: Optional[list[str]] = None
    start_time = time.time()
    if os.path.isfile(filepath):
        # locally
        try:
            with open(filepath) as f:
                serialized_ops = json.load(f)

            # NOTE: the assert is deliberately inside the try block so that a
            # malformed cache entry degrades to a cache miss (logged below)
            # instead of crashing compilation.
            assert isinstance(serialized_ops, list), (
                f"Expected serialized ops is a list, got {type(serialized_ops)}"
            )
        except Exception as e:
            log.warning(
                "Failed to load CUTLASS config %s from local cache: %s",
                filename,
                e,
            )
            serialized_ops = None
    elif config.is_fbcode():
        # Imported lazily because the fb-internal module does not exist in OSS.
        from torch._inductor.fb.cutlass_remote_cache import (
            maybe_fetch_cutlass_configs_from_remote,
        )

        # from remote
        serialized_ops = maybe_fetch_cutlass_configs_from_remote(filepath)

    if serialized_ops is None:
        return None

    # deserialize
    serializer = get_cutlass_operation_serializer()
    full_ops = [serializer.deserialize(x) for x in serialized_ops]  # type: ignore[union-attr]
    log.info("Loaded ops from %s cache in %.3fs", filename, time.time() - start_time)
    return full_ops
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/cuda/cutlass_lib_extensions/__init__.py ADDED
File without changes
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/cuda/cutlass_lib_extensions/evt_extensions.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Callable, Union
2
+
3
+ from sympy import Expr
4
+
5
+ from torch._inductor.ir import (
6
+ ComputedBuffer,
7
+ InputBuffer,
8
+ is_contiguous_strides_for_shape,
9
+ )
10
+ from torch.utils._ordered_set import OrderedSet
11
+
12
+ from ..cutlass_utils import torch_dtype_to_cutlass_type, try_import_cutlass
13
+
14
+
15
+ EpilogueFunctor = Any # EpilogueFunctor local class defined in _trace
16
+ Buffer = Union[ComputedBuffer, InputBuffer]
17
+ CutlassTupleType = Any # cutlass.backend.c_types.tuple_factory_.<locals>.TupleType
18
+ CutlassVisitorType = Any # cutlass.backend.c_types.visitor_factory.<locals>.VisitorType
19
+ CutlassArgType = (
20
+ Any # Can be a CutlassTupleType, CutlassVisitorType, EmptyByte, or ctype.c_void_p
21
+ )
22
+
23
+
24
+ if try_import_cutlass():
25
+ import ast
26
+ import ctypes
27
+ import textwrap
28
+ from typing import Union
29
+
30
+ from cutlass.backend.c_types import ( # type: ignore[import-untyped, import-not-found]
31
+ EmptyByte,
32
+ )
33
+ from cutlass.backend.epilogue import ( # type: ignore[import-untyped, import-not-found]
34
+ dtype2ctype,
35
+ )
36
+ from cutlass.backend.evt import ( # type: ignore[import-untyped, import-not-found]
37
+ EpilogueFunctorVisitor,
38
+ )
39
+ from cutlass.backend.evt.backend.emitter_base import ( # type: ignore[import-untyped, import-not-found]
40
+ FusionCallbacks,
41
+ )
42
+ from cutlass.backend.evt.backend.sm90_emitter import ( # type: ignore[import-untyped, import-not-found]
43
+ CollectiveEpilogue,
44
+ )
45
+ from cutlass.backend.evt.frontend import ( # type: ignore[import-untyped, import-not-found]
46
+ PythonASTFrontend,
47
+ )
48
+ from cutlass.backend.evt.ir.tensor import ( # type: ignore[import-untyped, import-not-found]
49
+ Tensor as CutlassTensor,
50
+ )
51
+ from cutlass_library import (
52
+ DataType,
53
+ EpilogueScheduleType,
54
+ LayoutType,
55
+ TileDescription,
56
+ )
57
+
58
+ from torch._inductor.codegen.cuda import cuda_env
59
+ from torch._inductor.utils import IndentedBuffer
60
+
61
+ _CUTLASS_C_DTYPES = OrderedSet(dtype2ctype.values()) # type: ignore[var-annotated]
62
+
63
+ def create_example_tensors(
64
+ var_name_to_buffer_name: dict[str, str],
65
+ name_to_buffer: dict[str, Buffer],
66
+ size_hint_fn: Callable[[Union[Expr, int]], int],
67
+ ) -> dict[str, CutlassTensor]:
68
+ def cutlass_tensor_from_buffer(buffer: Buffer) -> CutlassTensor:
69
+ shape = buffer.get_layout().size
70
+ stride = buffer.get_layout().stride
71
+ shape = tuple(size_hint_fn(x) for x in shape)
72
+ stride = tuple(size_hint_fn(x) for x in stride)
73
+
74
+ is_row_major = is_contiguous_strides_for_shape(stride, shape)
75
+ is_column_major = is_contiguous_strides_for_shape(stride[::-1], shape[::-1])
76
+
77
+ if not is_row_major and not is_column_major:
78
+ raise RuntimeError(
79
+ f"Cannot create example tensor for {buffer.get_name()} with \
80
+ non-contiguous layout, received stride: {stride} and shape: {shape}"
81
+ )
82
+
83
+ return CutlassTensor(
84
+ shape=shape,
85
+ layout_tag=LayoutType.RowMajor
86
+ if is_row_major
87
+ else LayoutType.ColumnMajor,
88
+ element=torch_dtype_to_cutlass_type(buffer.get_layout().dtype),
89
+ )
90
+
91
+ return {
92
+ key: cutlass_tensor_from_buffer(name_to_buffer[name])
93
+ for key, name in var_name_to_buffer_name.items()
94
+ }
95
+
96
+ def trace(
97
+ fn_src: str,
98
+ example_tensors: dict[str, CutlassTensor],
99
+ accum_type: DataType,
100
+ output_type: DataType,
101
+ tile_description: TileDescription,
102
+ epilogue_schedule: EpilogueScheduleType,
103
+ name_to_buffer: dict[str, Buffer],
104
+ size_hint_fn: Callable[[Union[Expr, int]], int],
105
+ **kwargs: dict[str, Any],
106
+ ) -> tuple[str, str, str]:
107
+ cuda_arch = int(cuda_env.get_cuda_arch()) # type: ignore[arg-type]
108
+ assert cuda_arch >= 90, "Only SM90+ is supported for EVT"
109
+ epilogue_functor = _trace(fn_src, example_tensors, cuda_arch, **kwargs)
110
+ visitor = EpilogueFunctorVisitor(cuda_arch, epilogue_functor)
111
+ fusion_callbacks = FusionCallbacks(visitor.graph, cuda_arch, emit_CD=False)
112
+ collective_epilogue = CollectiveEpilogue(
113
+ tile_description,
114
+ epilogue_schedule,
115
+ accum_type,
116
+ output_type,
117
+ fusion_callbacks,
118
+ )
119
+ evt_name, evt_code = collective_epilogue.emit()
120
+ evt_args = _render_argument_type(epilogue_functor, name_to_buffer, size_hint_fn)
121
+ return evt_name, evt_args, evt_code
122
+
123
# Based off of
# https://github.com/NVIDIA/cutlass/blob/df18f5e4f5de76bed8be1de8e4c245f2f5ec3020/python/cutlass/epilogue/epilogue.py#L117
# This is modified to enable directly passing the source code of the epilogue vs getting it from a bona-fide python function
# The reason for this is that inspect.getsource does not work with functions defined at runtime via exec/eval
def _trace(
    fn_src: str, example_tensors: dict[str, CutlassTensor], cc: int, **kwargs: Any
) -> EpilogueFunctor:
    """Trace epilogue source text into a cutlass EpilogueFunctor.

    Args:
        fn_src: python source of the epilogue, as text (survives
            exec/eval-defined functions, unlike inspect.getsource).
        example_tensors: example inputs keyed by EVT variable name.
        cc: target compute capability to specialize the frontend for.
        **kwargs: forwarded to the PythonASTFrontend constructor.
    """

    class EpilogueFunctor(PythonASTFrontend):
        def __init__(self, cc: int, **kwargs: Any):
            self.source = textwrap.dedent(fn_src)
            super().__init__(cc, **kwargs)

        def parse(self, example_inputs: dict[str, CutlassTensor]) -> None:
            self.example_inputs = example_inputs
            self.ast = ast.parse(self.source)
            self.visit(self.ast)

    # Fix: honor the caller-supplied ``cc`` instead of shadowing it with a
    # fresh environment query — previously the parameter was silently ignored.
    epilogue_functor = EpilogueFunctor(cc=cc, **kwargs)
    epilogue_functor.trace(example_tensors)
    return epilogue_functor
+ return epilogue_functor
144
+
145
+ def _render_argument_type(
146
+ epilogue_functor: EpilogueFunctor,
147
+ name_to_buffer: dict[str, Buffer],
148
+ size_hint_fn: Callable[[Union[Expr, int]], int],
149
+ ) -> str:
150
+ epilogue_thread_type = epilogue_functor.epilogue_thread_type
151
+
152
+ # Fragile, but this is the only way to guarantee t is expected type because t is a local class
153
+ def is_nested_visitor_type(t: type) -> bool:
154
+ return (
155
+ ".".join([t.__module__, t.__qualname__])
156
+ == "cutlass.backend.c_types.visitor_factory.<locals>.VisitorType"
157
+ )
158
+
159
+ buffer = IndentedBuffer()
160
+ with buffer.set_tabwidth(2):
161
+
162
+ def render_argument_type(name: str, t: CutlassArgType) -> None:
163
+ if issubclass(t, ctypes.c_byte):
164
+ buffer.writeline(f"{{}}, /* {name} */")
165
+ else:
166
+ fields = [
167
+ (
168
+ fname,
169
+ _get_arg_from_node(ty, name_to_buffer[name], size_hint_fn),
170
+ )
171
+ for fname, ty in t._fields_
172
+ ]
173
+ field_strs = [
174
+ f"/* {fname} */ {str(field)}" for fname, field in fields
175
+ ]
176
+ buffer.writeline(f"{{{', '.join(field_strs)}}}, /* {name} */")
177
+
178
+ def render_thread_type(name: str, t: CutlassArgType) -> None:
179
+ if is_nested_visitor_type(t):
180
+ buffer.writeline(f"{{ /* {name} */")
181
+ with buffer.indent():
182
+ for name, inner_t in t._fields_:
183
+ render_thread_type(name, inner_t)
184
+ buffer.writeline("},")
185
+ else:
186
+ render_argument_type(name, t)
187
+
188
+ # unroll the recursion once to address special case formatting
189
+ # namely, no ending comma and no indentation for the outermost thread type
190
+ buffer.writeline("{ /* thread */")
191
+ with buffer.indent(3):
192
+ if is_nested_visitor_type(epilogue_thread_type):
193
+ with buffer.indent():
194
+ for name, inner_t in epilogue_thread_type._fields_:
195
+ render_thread_type(name, inner_t)
196
+ else:
197
+ render_argument_type("thread", epilogue_thread_type)
198
+ buffer.writeline("}")
199
+
200
+ return buffer.getvalue()
201
+
202
+ def _get_arg_from_node(
203
+ arg_ty: type, node: Buffer, size_hint_fn: Callable[[Union[Expr, int]], int]
204
+ ) -> str:
205
+ from ..cuda_template import CUTLASSTemplate
206
+
207
+ # Today, arguments are either a pointer to the
208
+ # node's memory, a stride tuple, the datatype
209
+ # Once again, need to check for local class type for stride tuple
210
+ if (
211
+ str(arg_ty)
212
+ == "<class 'cutlass.backend.c_types.tuple_factory_.<locals>.TupleType'>"
213
+ ):
214
+ DEFAULT_STRIDE_LEN = 3
215
+ assert len(node.get_layout().stride) <= DEFAULT_STRIDE_LEN
216
+ stride = [size_hint_fn(x) for x in node.get_layout().stride]
217
+ for _ in range(DEFAULT_STRIDE_LEN - len(stride)):
218
+ stride.append(0)
219
+
220
+ def render_stride(x: int) -> str:
221
+ # Handle EBO for 0 and 1
222
+ if x == 0:
223
+ return "_0{}"
224
+ elif x == 1:
225
+ return "_1{}"
226
+ else:
227
+ return str(x)
228
+
229
+ return f"{{{', '.join([render_stride(x) for x in stride])}}}"
230
+
231
+ elif issubclass(arg_ty, ctypes.c_void_p):
232
+ return f"({CUTLASSTemplate._DTYPE_TO_CUTLASS[node.get_layout().dtype]}*) {node.get_name()}"
233
+ elif (
234
+ arg_ty in _CUTLASS_C_DTYPES
235
+ ): # Assumption: this is the element dtype, this holds for all cutlass ir nodes currently
236
+ return f"{CUTLASSTemplate._DTYPE_TO_CUTLASS[node.get_layout().dtype]}(0)"
237
+ elif issubclass(arg_ty, EmptyByte):
238
+ return "{}"
239
+
240
+ raise NotImplementedError(f"Unsupported arg type: {arg_ty}")
.venv/lib/python3.12/site-packages/torch/_inductor/codegen/cuda/cutlass_lib_extensions/gemm_operation_extensions.py ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: ignore-errors
2
+ from ..cutlass_utils import try_import_cutlass
3
+
4
+
5
+ # copied / modified from original at
6
+ # https://github.com/NVIDIA/cutlass/blob/8783c41851cd3582490e04e69e0cd756a8c1db7f/tools/library/scripts/gemm_operation.py#L658
7
+
8
+ if try_import_cutlass():
9
+ import enum
10
+
11
+ from cutlass_library.gemm_operation import * # noqa: F401, F403
12
+ from cutlass_library.library import * # noqa: F401, F403
13
+
14
+ _LOGGER = logging.getLogger(__name__)
15
+
16
+ class EmitGemmUniversal3xInstanceWithEVT:
17
+ """Responsible for emitting a CUTLASS 3.x template definition"""
18
+
19
+ def __init__(self, operation_suffix="", evt_name=None):
20
+ self.operation_suffix = operation_suffix
21
+ self.includes = [
22
+ "cutlass/cutlass.h",
23
+ "cutlass/gemm/gemm.h",
24
+ "cutlass/numeric_types.h",
25
+ "cutlass/gemm/kernel/gemm_universal.hpp",
26
+ "cutlass/gemm/collective/collective_builder.hpp",
27
+ "cutlass/epilogue/collective/collective_builder.hpp",
28
+ ]
29
+ self.builtin_epilogue_functor_template = """${epilogue_functor}<
30
+ ${element_d},
31
+ ${element_epilogue},
32
+ ${element_c},
33
+ ${element_epilogue}
34
+ >"""
35
+
36
+ self.evt_name = evt_name
37
+ self.gemm_template = """
38
+ using ${operation_name}_epilogue =
39
+ typename cutlass::epilogue::collective::CollectiveBuilder<
40
+ ${arch}, ${opcode_class_epi},
41
+ cute::Shape<cute::_${tile_shape_m}, cute::_${tile_shape_n}, cute::_${tile_shape_k}>,
42
+ cute::Shape<${cluster_shape_m}, ${cluster_shape_n}, ${cluster_shape_k}>,
43
+ ${epi_tile_mn},
44
+ ${element_accumulator}, ${element_epilogue},
45
+ ${element_c}, ${layout_c}, ${align_c},
46
+ ${element_d}, ${layout_d}, ${align_d},
47
+ ${epilogue_schedule},
48
+ ${epilogue_functor}
49
+ >::CollectiveOp;
50
+
51
+ ${mixed_dtype_prepare_code}
52
+
53
+ using ${operation_name}_mainloop =
54
+ typename cutlass::gemm::collective::CollectiveBuilder<
55
+ ${arch}, ${opcode_class_main},
56
+ ${element_a}, ${layout_a}, ${align_a},
57
+ ${element_b}, ${layout_b}, ${align_b},
58
+ ${element_accumulator},
59
+ cute::Shape<cute::_${tile_shape_m}, cute::_${tile_shape_n}, cute::_${tile_shape_k}>,
60
+ cute::Shape<${cluster_shape_m}, ${cluster_shape_n}, ${cluster_shape_k}>,
61
+ ${stages},
62
+ ${kernel_schedule}
63
+ >::CollectiveOp;
64
+
65
+ // Gemm operator ${operation_name}
66
+ using ${operation_name}_base = cutlass::gemm::kernel::GemmUniversal<
67
+ ${problem_shape},
68
+ ${operation_name}_mainloop,
69
+ ${operation_name}_epilogue,
70
+ ${tile_scheduler}>;
71
+
72
+ // Define named type
73
+ struct ${operation_name} :
74
+ public ${operation_name}_base { };
75
+
76
+ """
77
+
78
+ #
79
+ def instance_template(self):
80
+ return """
81
+ ${compile_guard_start}
82
+ {
83
+ using GemmKernel = cutlass::gemm::device::GemmUniversalAdapter<${operation_name}>;
84
+ manifest.append(
85
+ new ${gemm_kind}<GemmKernel>("${operation_name}"));
86
+ }
87
+ ${compile_guard_end}
88
+ """
89
+
90
+ def emit_block_scale_epilogue_functor(self, operation):
91
+ block_scaled_template = """
92
+ ${epilogue_functor}<
93
+ ${epi_vs},
94
+ ${element_d},
95
+ ${element_accumulator},
96
+ ${element_sfd},
97
+ ${layout_sfd},
98
+ ${element_c},
99
+ ${element_scalar}
100
+ >
101
+ """
102
+ block_scaled_values = {
103
+ "epi_vs": str(operation.ScaleFactorVectorSize),
104
+ "element_d": str(DataTypeTag[operation.D.element]),
105
+ "element_sfd": str(DataTypeTag[operation.ScaleFactorD.element]),
106
+ "layout_sfd": LayoutTag[operation.ScaleFactorD.layout],
107
+ "epilogue_functor": EpilogueFunctor3xTag[
108
+ EpilogueFunctor3x.LinearCombinationBlockScaleFactor
109
+ ],
110
+ "element_accumulator": str(DataTypeTag[operation.accumulator_type()]),
111
+ "element_scalar": str(DataTypeTag[operation.accumulator_type()]),
112
+ "element_c": str(DataTypeTag[operation.C.element]),
113
+ }
114
+ return SubstituteTemplate(block_scaled_template, block_scaled_values)
115
+
116
+ @staticmethod
117
+ def pointerize_if_grouped(operation, layout):
118
+ return layout if not is_grouped(operation.gemm_kind) else layout + "* "
119
+
120
+ @staticmethod
121
+ def problem_shape(operation):
122
+ gemm_shape_type = "cute::Shape<int,int,int,int>"
123
+ grouped_gemm_shape_type = "cute::Shape<int,int,int>"
124
+ grouped_gemm_shape_type = (
125
+ "cutlass::gemm::GroupProblemShape<" + grouped_gemm_shape_type + ">"
126
+ )
127
+
128
+ return (
129
+ gemm_shape_type
130
+ if not is_grouped(operation.gemm_kind)
131
+ else grouped_gemm_shape_type
132
+ )
133
+
134
+ def emit(self, operation):
135
+ """Given a gem operation, emits a template definition of the operation"""
136
+
137
+ opcode_class_main = operation.tile_description.math_instruction.opcode_class
138
+ opcode_class_epi = opcode_class_main
139
+
140
+ tile_shape = operation.tile_description.tile_shape
141
+ instruction_shape = (
142
+ operation.tile_description.math_instruction.instruction_shape
143
+ )
144
+ cluster_m = operation.tile_description.cluster_shape[0]
145
+ cluster_n = operation.tile_description.cluster_shape[1]
146
+
147
+ tile_shape_m, tile_shape_n, tile_shape_k = tile_shape
148
+
149
+ # account for static/dynamic cluster shapes
150
+ cta_m = tile_shape[0] // cluster_m if cluster_m > 0 else tile_shape[0]
151
+ cta_n = tile_shape[1] // cluster_n if cluster_n > 0 else tile_shape[1]
152
+
153
+ # Shape passed to epilogue builder
154
+ is_sm100_kernel = operation.arch == 100
155
+ if is_sm100_kernel:
156
+ cta_m_per_mma_instruction = (
157
+ 2 if "2sm" in operation.procedural_name() else 1
158
+ )
159
+ if cluster_m <= 0:
160
+ cta_m = cta_m // cta_m_per_mma_instruction
161
+
162
+ if opcode_class_main in [
163
+ OpcodeClass.TensorOp,
164
+ OpcodeClass.BlockScaledTensorOp,
165
+ ]:
166
+ tile_shape_m = instruction_shape[0]
167
+ tile_shape_n = instruction_shape[1]
168
+
169
+ # stage count set to zero indicates builder automatic stage selection
170
+ if operation.tile_description.stages > 0:
171
+ stage_count_string = f"cutlass::gemm::collective::StageCount<\
172
+ {str(operation.tile_description.stages)}>"
173
+ else:
174
+ stage_count_string = (
175
+ f"cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(\
176
+ sizeof(typename {str(operation.procedural_name())}_epilogue::SharedStorage))>"
177
+ )
178
+
179
+ epi_tile_mn = "cutlass::epilogue::collective::EpilogueTileAuto"
180
+
181
+ (
182
+ instance_layout_A,
183
+ instance_layout_B,
184
+ instance_layout_C,
185
+ instance_layout_D,
186
+ ) = (
187
+ operation.A.layout,
188
+ operation.B.layout,
189
+ operation.C.layout,
190
+ operation.D.layout,
191
+ )
192
+
193
+ # 3.0 profiler integration only supports trivial epilogues for now
194
+ epilogue_vector_length = 1
195
+
196
+ # Support built-in epilogue functors or user-defined functions
197
+ if isinstance(operation.epilogue_functor, enum.Enum):
198
+ values = {
199
+ "element_epilogue": str(DataTypeTag[operation.element_epilogue]),
200
+ "epilogue_functor": EpilogueFunctor3xTag[
201
+ operation.epilogue_functor
202
+ ],
203
+ }
204
+ epilogue_functor = SubstituteTemplate(
205
+ self.builtin_epilogue_functor_template, values
206
+ )
207
+
208
+ if (
209
+ is_block_scaled(operation.gemm_kind)
210
+ and operation.ScaleFactorD.element != DataType.void
211
+ ):
212
+ epilogue_functor = self.emit_block_scale_epilogue_functor(operation)
213
+ else:
214
+ epilogue_functor = self.epilogue_functor.emit_declaration()
215
+
216
+ if (
217
+ is_block_scaled(operation.gemm_kind)
218
+ and operation.ScaleFactorD.element != DataType.void
219
+ ):
220
+ epilogue_functor = self.emit_block_scale_epilogue_functor(operation)
221
+
222
+ #
223
+ # Cutlass3x complex kernels' ElementA(B) is a tuple in collective mainloop builder,
224
+ # e.g. cute::tuple<Element, Transform>, Transform : cute::identity / cute::conjugate.
225
+ element_a = (
226
+ DataTypeTag[operation.A.element]
227
+ if not operation.is_complex()
228
+ else f"cute::tuple<{str(DataTypeTag[operation.A.element])},\
229
+ {str(ComplexTransformTag3x[operation.A.complex_transform])}>"
230
+ )
231
+ element_b = (
232
+ DataTypeTag[operation.B.element]
233
+ if not operation.is_complex()
234
+ else f"cute::tuple<{str(DataTypeTag[operation.B.element])},\
235
+ {str(ComplexTransformTag3x[operation.B.complex_transform])}>"
236
+ )
237
+ epilogue_schedule_type = EpilogueScheduleTag[operation.epilogue_schedule]
238
+
239
+ if opcode_class_main == OpcodeClass.BlockScaledTensorOp:
240
+ is_no_smem_epilogue = operation.epilogue_schedule in [
241
+ EpilogueScheduleType.NoSmemWarpSpecialized1Sm,
242
+ EpilogueScheduleType.NoSmemWarpSpecialized2Sm,
243
+ ]
244
+ grouped = is_grouped(operation.gemm_kind)
245
+ if cta_n == 256 and operation.kernel_schedule == to_grouped_schedule(
246
+ KernelScheduleType.Nvf4TmaWarpSpecialized1SmSm100, grouped
247
+ ):
248
+ epi_tile_mn = "cute::Shape<cute::_128,cute::_64>"
249
+ if not is_no_smem_epilogue:
250
+ epilogue_schedule_type = EpilogueScheduleTag[
251
+ to_grouped_schedule(
252
+ EpilogueScheduleType.TmaWarpSpecialized1Sm, grouped
253
+ )
254
+ ]
255
+ if cta_n == 256 and operation.kernel_schedule == to_grouped_schedule(
256
+ KernelScheduleType.Nvf4TmaWarpSpecialized2SmSm100, grouped
257
+ ):
258
+ epi_tile_mn = "cute::Shape<cute::_128,cute::_64>"
259
+ if not is_no_smem_epilogue:
260
+ epilogue_schedule_type = EpilogueScheduleTag[
261
+ to_grouped_schedule(
262
+ EpilogueScheduleType.TmaWarpSpecialized2Sm, grouped
263
+ )
264
+ ]
265
+ element_a = f"cute::tuple<{str(element_a)},{str(DataTypeTag[operation.ScaleFactorA])}>"
266
+ element_b = f"cute::tuple<{str(element_b)},{str(DataTypeTag[operation.ScaleFactorB])}>"
267
+
268
+ operation_name_str = operation.procedural_name()
269
+ layout_a_str = LayoutTag[instance_layout_A]
270
+ layout_b_str = LayoutTag[instance_layout_B]
271
+ mixed_dtype_prepare_code = ""
272
+ if operation.mixed_input_mode is not None:
273
+ A_dtype = operation.A.element
274
+ B_dtype = operation.B.element
275
+ A_dtype_bits = DataTypeSize[A_dtype]
276
+ B_dtype_bits = DataTypeSize[B_dtype]
277
+ is_A_dtype_narrow = A_dtype_bits < B_dtype_bits
278
+ if is_A_dtype_narrow:
279
+ narrow_dtype, wide_dtype = (A_dtype, B_dtype)
280
+ narrow_dtype_bits, wide_dtype_bits = (A_dtype_bits, B_dtype_bits)
281
+ else:
282
+ narrow_dtype, wide_dtype = (B_dtype, A_dtype)
283
+ narrow_dtype_bits, wide_dtype_bits = (B_dtype_bits, A_dtype_bits)
284
+
285
+ narrow_tag = DataTypeTag[narrow_dtype]
286
+ wide_tag = DataTypeTag[wide_dtype]
287
+ scale_tag = DataTypeTag[wide_dtype]
288
+ zero_tag = DataTypeTag[wide_dtype]
289
+
290
+ do_shuffle = False
291
+ value_shuffle_str = ""
292
+ if narrow_dtype_bits == 4 and wide_dtype_bits == 16:
293
+ value_shuffle_str = "cute::Layout<cute::Shape<cute::_2,cute::_4>, \
294
+ cute::Stride<cute::_4,cute::_1>>"
295
+ do_shuffle = True
296
+ if narrow_dtype_bits == 8 and wide_dtype_bits == 16:
297
+ value_shuffle_str = "cute::Layout<cute::Shape<cute::_2,cute::_2>, \
298
+ cute::Stride<cute::_2,cute::_1>>"
299
+ do_shuffle = True
300
+ do_shuffle = operation.mixed_input_shuffle and do_shuffle
301
+
302
+ if do_shuffle:
303
+ if is_A_dtype_narrow:
304
+ stride_narrow_str = (
305
+ f"cutlass::detail::TagToStrideA_t<{layout_a_str}>"
306
+ )
307
+ layout_a_str = f"{operation_name_str}_LayoutNarrowReordered"
308
+ else:
309
+ stride_narrow_str = (
310
+ f"cutlass::detail::TagToStrideB_t<{layout_b_str}>"
311
+ )
312
+ layout_b_str = f"{operation_name_str}_LayoutNarrowReordered"
313
+ # The {operation_name_str}_ prefixs in mixed_dtype_prepare_code and
314
+ # layout_{a, b}_str are to prevent errors in Windows platform unity build
315
+ mixed_dtype_prepare_code = f"""
316
+ using {operation_name_str}_StrideNarrow = {stride_narrow_str};
317
+ using {operation_name_str}_ValueShuffle = {value_shuffle_str};
318
+ static constexpr int {operation_name_str}_NumShuffleAtoms = 1;
319
+ using {operation_name_str}_MmaAtomShape = \
320
+ cute::Layout<cute::Shape<cute::_1, cute::Int<{operation_name_str}_NumShuffleAtoms>>>;
321
+ using {operation_name_str}_LayoutAtomQuant = \
322
+ decltype(cutlass::compute_memory_reordering_atom<{wide_tag}, {operation_name_str}_MmaAtomShape, \
323
+ {operation_name_str}_ValueShuffle>());
324
+ using {operation_name_str}_LayoutNarrowReordered = \
325
+ decltype(cute::tile_to_shape({operation_name_str}_LayoutAtomQuant{{}}, \
326
+ cute::Layout<cute::Shape<int,int,int>, {operation_name_str}_StrideNarrow>{{}}));
327
+ """
328
+
329
+ mixed_input_modes_to_element = {
330
+ MixedInputMode.ConvertOnly: narrow_tag,
331
+ MixedInputMode.ScaleOnly: f"cute::tuple<{narrow_tag}, {scale_tag}>",
332
+ MixedInputMode.ScaleWithZeroPoint: f"cute::tuple<{narrow_tag}, {scale_tag}, {zero_tag}>",
333
+ }
334
+ narrow_element = mixed_input_modes_to_element.get(
335
+ operation.mixed_input_mode, narrow_tag
336
+ )
337
+
338
+ if narrow_dtype == DataType.s4 and (
339
+ wide_dtype == DataType.e4m3 or wide_dtype == DataType.e5m2
340
+ ):
341
+ narrow_element = (
342
+ f"cute::tuple<{narrow_tag}, cutlass::Array<{scale_tag}, 8>>"
343
+ )
344
+
345
+ if is_A_dtype_narrow:
346
+ element_a = narrow_element
347
+ else:
348
+ element_b = narrow_element
349
+
350
+ if self.evt_name:
351
+ epilogue_functor = self.evt_name
352
+
353
+ values = {
354
+ "operation_name": operation_name_str,
355
+ "operation_suffix": self.operation_suffix,
356
+ "problem_shape": self.problem_shape(operation),
357
+ "element_a": element_a,
358
+ "layout_a": self.pointerize_if_grouped(operation, layout_a_str),
359
+ "element_b": element_b,
360
+ "layout_b": self.pointerize_if_grouped(operation, layout_b_str),
361
+ "element_c": DataTypeTag[operation.C.element],
362
+ "layout_c": self.pointerize_if_grouped(
363
+ operation, LayoutTag[instance_layout_C]
364
+ ),
365
+ "element_d": DataTypeTag[operation.D.element],
366
+ "layout_d": self.pointerize_if_grouped(
367
+ operation, LayoutTag[instance_layout_D]
368
+ ),
369
+ "element_accumulator": DataTypeTag[operation.accumulator_type()],
370
+ "opcode_class_main": OpcodeClassTag[opcode_class_main],
371
+ "opcode_class_epi": OpcodeClassTag[opcode_class_epi],
372
+ "arch": f"cutlass::arch::Sm{operation.arch}",
373
+ "tile_shape_m": str(tile_shape_m),
374
+ "tile_shape_n": str(tile_shape_n),
375
+ "tile_shape_k": str(tile_shape_k),
376
+ "cluster_shape_m": "cute::_"
377
+ + str(operation.tile_description.cluster_shape[0])
378
+ if operation.tile_description.cluster_shape[0] > 0
379
+ else "int",
380
+ "cluster_shape_n": "cute::_"
381
+ + str(operation.tile_description.cluster_shape[1])
382
+ if operation.tile_description.cluster_shape[1] > 0
383
+ else "int",
384
+ "cluster_shape_k": "cute::_"
385
+ + str(operation.tile_description.cluster_shape[2])
386
+ if operation.tile_description.cluster_shape[2] > 0
387
+ else "int",
388
+ "instruction_shape_m": str(instruction_shape[0]),
389
+ "instruction_shape_n": str(instruction_shape[1]),
390
+ "instruction_shape_k": str(instruction_shape[2]),
391
+ "kernel_schedule": str(KernelScheduleTag[operation.kernel_schedule]),
392
+ "epilogue_schedule": str(epilogue_schedule_type),
393
+ "epi_tile_mn": epi_tile_mn,
394
+ "epilogue_functor": epilogue_functor,
395
+ "stages": stage_count_string,
396
+ "align_a": str(operation.A.alignment),
397
+ "align_b": str(operation.B.alignment),
398
+ "align_c": str(operation.C.alignment),
399
+ "align_d": str(operation.C.alignment),
400
+ "transform_a": ComplexTransformTag[operation.A.complex_transform],
401
+ "transform_b": ComplexTransformTag[operation.B.complex_transform],
402
+ "math_operation": MathOperationTag[
403
+ operation.tile_description.math_instruction.math_operation
404
+ ],
405
+ "epilogue_vector_length": str(epilogue_vector_length),
406
+ "element_epilogue": str(DataTypeTag[operation.element_epilogue]),
407
+ "tile_scheduler": str(TileSchedulerTag[operation.tile_scheduler]),
408
+ "mixed_dtype_prepare_code": mixed_dtype_prepare_code,
409
+ }
410
+
411
+ return SubstituteTemplate(self.gemm_template, values)