Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- progress/SpecForge/cache/compiled_kernels/triton/1/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/__grp__triton_tem_fused_mul_1.json +1 -0
- progress/SpecForge/cache/compiled_kernels/triton/1/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.llir +0 -0
- progress/SpecForge/cache/compiled_kernels/triton/1/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.ttir +0 -0
- progress/SpecForge/cache/compiled_kernels/triton/1/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/__grp__triton_tem_fused_mul_1.json +1 -0
- progress/SpecForge/cache/compiled_kernels/triton/1/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.llir +0 -0
- progress/SpecForge/cache/compiled_kernels/triton/1/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.ptx +0 -0
- progress/SpecForge/cache/compiled_kernels/triton/1/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.ttgir +0 -0
- progress/SpecForge/cache/compiled_kernels/triton/1/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.ttir +0 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/__grp__triton_tem_fused_0.json +1 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.json +1 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.llir +0 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ptx +0 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.source +0 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ttgir +936 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ttir +780 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/__grp__triton_per_fused_mul_1.json +1 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.cubin +0 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.json +1 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.llir +1190 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.ptx +1141 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.source +391 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.ttgir +239 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.ttir +229 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/__grp__triton_poi_fused_mul_1.json +1 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.cubin +0 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.json +1 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.llir +89 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ptx +357 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.source +130 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ttgir +105 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ttir +104 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/4WLUGGKP4KNR26REGICEZ2CG72LG46AK5KFUG6JJKIDMZRATGGHQ/__grp__triton_poi_fused_mul_1.json +1 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/4WLUGGKP4KNR26REGICEZ2CG72LG46AK5KFUG6JJKIDMZRATGGHQ/triton_poi_fused_mul_1.cubin +0 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/4WLUGGKP4KNR26REGICEZ2CG72LG46AK5KFUG6JJKIDMZRATGGHQ/triton_poi_fused_mul_1.json +1 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/4WLUGGKP4KNR26REGICEZ2CG72LG46AK5KFUG6JJKIDMZRATGGHQ/triton_poi_fused_mul_1.llir +60 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/4WLUGGKP4KNR26REGICEZ2CG72LG46AK5KFUG6JJKIDMZRATGGHQ/triton_poi_fused_mul_1.ptx +224 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/4WLUGGKP4KNR26REGICEZ2CG72LG46AK5KFUG6JJKIDMZRATGGHQ/triton_poi_fused_mul_1.source +51 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/4WLUGGKP4KNR26REGICEZ2CG72LG46AK5KFUG6JJKIDMZRATGGHQ/triton_poi_fused_mul_1.ttgir +46 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/4WLUGGKP4KNR26REGICEZ2CG72LG46AK5KFUG6JJKIDMZRATGGHQ/triton_poi_fused_mul_1.ttir +45 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/55FURAY6LWNBQ2KK76SAMFCTAPMGJOZ6YFWFPHZ7UXAHP4Z4YGJA/__grp__triton_red_fused_mul_0.json +1 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/55FURAY6LWNBQ2KK76SAMFCTAPMGJOZ6YFWFPHZ7UXAHP4Z4YGJA/triton_red_fused_mul_0.cubin +0 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/55FURAY6LWNBQ2KK76SAMFCTAPMGJOZ6YFWFPHZ7UXAHP4Z4YGJA/triton_red_fused_mul_0.json +1 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/55FURAY6LWNBQ2KK76SAMFCTAPMGJOZ6YFWFPHZ7UXAHP4Z4YGJA/triton_red_fused_mul_0.llir +179 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/55FURAY6LWNBQ2KK76SAMFCTAPMGJOZ6YFWFPHZ7UXAHP4Z4YGJA/triton_red_fused_mul_0.ptx +424 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/55FURAY6LWNBQ2KK76SAMFCTAPMGJOZ6YFWFPHZ7UXAHP4Z4YGJA/triton_red_fused_mul_0.source +230 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/55FURAY6LWNBQ2KK76SAMFCTAPMGJOZ6YFWFPHZ7UXAHP4Z4YGJA/triton_red_fused_mul_0.ttgir +168 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/55FURAY6LWNBQ2KK76SAMFCTAPMGJOZ6YFWFPHZ7UXAHP4Z4YGJA/triton_red_fused_mul_0.ttir +163 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/__grp__triton_tem_fused_0.json +1 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.json +1 -0
- progress/SpecForge/cache/compiled_kernels/triton/2/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.llir +0 -0
progress/SpecForge/cache/compiled_kernels/triton/1/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/__grp__triton_tem_fused_mul_1.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"child_paths": {"triton_tem_fused_mul_1.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.source", "triton_tem_fused_mul_1.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.ttir", "triton_tem_fused_mul_1.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.ttgir", "triton_tem_fused_mul_1.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.llir", "triton_tem_fused_mul_1.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.ptx", "triton_tem_fused_mul_1.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.cubin", "triton_tem_fused_mul_1.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.json"}}
|
progress/SpecForge/cache/compiled_kernels/triton/1/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.llir
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
progress/SpecForge/cache/compiled_kernels/triton/1/7XYCMOPGDICSBBN4N46UWO2BLUA2ZH4YYLO2F5SVEXNOEL6NVJRQ/triton_tem_fused_mul_1.ttir
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
progress/SpecForge/cache/compiled_kernels/triton/1/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/__grp__triton_tem_fused_mul_1.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"child_paths": {"triton_tem_fused_mul_1.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.source", "triton_tem_fused_mul_1.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.ttir", "triton_tem_fused_mul_1.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.ttgir", "triton_tem_fused_mul_1.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.llir", "triton_tem_fused_mul_1.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.ptx", "triton_tem_fused_mul_1.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.cubin", "triton_tem_fused_mul_1.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/1/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.json"}}
|
progress/SpecForge/cache/compiled_kernels/triton/1/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.llir
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
progress/SpecForge/cache/compiled_kernels/triton/1/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.ptx
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
progress/SpecForge/cache/compiled_kernels/triton/1/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.ttgir
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
progress/SpecForge/cache/compiled_kernels/triton/1/PJJES3QEVXF7MPESQRKFQ4D55L4Y7YJPTGXSVMGRCNUVXD3MMXGQ/triton_tem_fused_mul_1.ttir
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
progress/SpecForge/cache/compiled_kernels/triton/2/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/__grp__triton_tem_fused_0.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"child_paths": {"triton_tem_fused_0.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.source", "triton_tem_fused_0.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ttir", "triton_tem_fused_0.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ttgir", "triton_tem_fused_0.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.llir", "triton_tem_fused_0.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ptx", "triton_tem_fused_0.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.cubin", "triton_tem_fused_0.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.json"}}
|
progress/SpecForge/cache/compiled_kernels/triton/2/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"hash": "d6d8579bc758efaefba73018088fcd8f749b5f25c3ab12d02ac240e2311ac409", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 131072, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_tem_fused_0"}
|
progress/SpecForge/cache/compiled_kernels/triton/2/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.llir
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
progress/SpecForge/cache/compiled_kernels/triton/2/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ptx
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
progress/SpecForge/cache/compiled_kernels/triton/2/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.source
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
progress/SpecForge/cache/compiled_kernels/triton/2/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ttgir
ADDED
|
@@ -0,0 +1,936 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}>
|
| 2 |
+
#blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
|
| 3 |
+
#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":18:0)
|
| 4 |
+
#loc1 = loc(unknown)
|
| 5 |
+
#loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":520:16)
|
| 6 |
+
#loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":172:41)
|
| 7 |
+
#loc83 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":403:51)
|
| 8 |
+
#loc95 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":416:34)
|
| 9 |
+
#loc131 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":198:45)
|
| 10 |
+
#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 64, 16]}>
|
| 11 |
+
#mma1 = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 128, 16]}>
|
| 12 |
+
#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
|
| 13 |
+
#shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 16}>
|
| 14 |
+
#smem = #ttg.shared_memory
|
| 15 |
+
#loc152 = loc("arg_Q"(#loc))
|
| 16 |
+
#loc153 = loc("arg_K"(#loc))
|
| 17 |
+
#loc154 = loc("arg_V"(#loc))
|
| 18 |
+
#loc155 = loc("arg_LSE"(#loc))
|
| 19 |
+
#loc156 = loc("arg_MAX"(#loc))
|
| 20 |
+
#loc157 = loc("arg_KV_NUM_BLKS"(#loc))
|
| 21 |
+
#loc158 = loc("arg_KV_IDX"(#loc))
|
| 22 |
+
#loc159 = loc("arg_FULL_KV_NUM_BLKS"(#loc))
|
| 23 |
+
#loc160 = loc("arg_FULL_KV_IDX"(#loc))
|
| 24 |
+
#loc161 = loc("out_ptr0"(#loc))
|
| 25 |
+
#loc162 = loc("ks0"(#loc))
|
| 26 |
+
#loc163 = loc("ks1"(#loc))
|
| 27 |
+
#loc164 = loc("ks2"(#loc))
|
| 28 |
+
#loc165 = loc("ks3"(#loc))
|
| 29 |
+
#loc166 = loc("ks4"(#loc))
|
| 30 |
+
#loc200 = loc(callsite(#loc41 at #loc42))
|
| 31 |
+
#loc239 = loc("m_ij"(#loc83))
|
| 32 |
+
#loc249 = loc("l_i"(#loc95))
|
| 33 |
+
#loc283 = loc(callsite(#loc41 at #loc131))
|
| 34 |
+
#loc345 = loc(callsite(#loc239 at #loc200))
|
| 35 |
+
#loc355 = loc(callsite(#loc249 at #loc200))
|
| 36 |
+
#loc374 = loc(callsite(#loc239 at #loc283))
|
| 37 |
+
#loc384 = loc(callsite(#loc249 at #loc283))
|
| 38 |
+
#loc406 = loc(callsite(#loc1 at #loc345))
|
| 39 |
+
#loc408 = loc(callsite(#loc1 at #loc355))
|
| 40 |
+
#loc436 = loc(callsite(#loc1 at #loc374))
|
| 41 |
+
#loc438 = loc(callsite(#loc1 at #loc384))
|
| 42 |
+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
|
| 43 |
+
tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_MAX: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("arg_MAX"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc)), %ks2: i32 loc("ks2"(#loc)), %ks3: i32 loc("ks3"(#loc)), %ks4: i32 loc("ks4"(#loc))) attributes {noinline = false} {
|
| 44 |
+
%cst = arith.constant dense<0> : tensor<1x64xi32, #mma> loc(#loc1)
|
| 45 |
+
%cst_0 = arith.constant dense<0> : tensor<128x1xi32, #mma> loc(#loc1)
|
| 46 |
+
%cst_1 = arith.constant dense<1> : tensor<128x1xi32, #mma> loc(#loc1)
|
| 47 |
+
%cst_2 = arith.constant dense<1> : tensor<1x64xi32, #mma> loc(#loc1)
|
| 48 |
+
%cst_3 = arith.constant dense<false> : tensor<128x64xi1, #mma> loc(#loc1)
|
| 49 |
+
%cst_4 = arith.constant dense<16> : tensor<1x64xi32, #mma> loc(#loc1)
|
| 50 |
+
%cst_5 = arith.constant dense<16> : tensor<128x1xi32, #mma> loc(#loc1)
|
| 51 |
+
%cst_6 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> loc(#loc1)
|
| 52 |
+
%cst_7 = arith.constant dense<1024> : tensor<64x1xi32, #blocked> loc(#loc1)
|
| 53 |
+
%cst_8 = arith.constant dense<4096> : tensor<128x1xi32, #blocked> loc(#loc1)
|
| 54 |
+
%cst_9 = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1)
|
| 55 |
+
%c2_i32 = arith.constant 2 : i32 loc(#loc1)
|
| 56 |
+
%c4_i32 = arith.constant 4 : i32 loc(#loc1)
|
| 57 |
+
%c32_i32 = arith.constant 32 : i32 loc(#loc1)
|
| 58 |
+
%c1_i32 = arith.constant 1 : i32 loc(#loc1)
|
| 59 |
+
%c128_i32 = arith.constant 128 : i32 loc(#loc1)
|
| 60 |
+
%c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
|
| 61 |
+
%cst_10 = arith.constant dense<0.000000e+00> : tensor<128x128xbf16, #blocked> loc(#loc1)
|
| 62 |
+
%cst_11 = arith.constant dense<0.000000e+00> : tensor<64x128xbf16, #blocked> loc(#loc1)
|
| 63 |
+
%c64_i32 = arith.constant 64 : i32 loc(#loc1)
|
| 64 |
+
%c63_i32 = arith.constant 63 : i32 loc(#loc1)
|
| 65 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc1)
|
| 66 |
+
%cst_12 = arith.constant dense<0.000000e+00> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1)
|
| 67 |
+
%cst_13 = arith.constant dense<1.000000e+00> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1)
|
| 68 |
+
%cst_14 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma1> loc(#loc1)
|
| 69 |
+
%cst_15 = arith.constant dense<0.0883883461> : tensor<128x64xf32, #mma> loc(#loc1)
|
| 70 |
+
%cst_16 = arith.constant dense<0xFF800000> : tensor<128x64xf32, #mma> loc(#loc1)
|
| 71 |
+
%cst_17 = arith.constant dense<1.44269502> : tensor<128x64xf32, #mma> loc(#loc1)
|
| 72 |
+
%cst_18 = arith.constant dense<0xFF800000> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1)
|
| 73 |
+
%c-1_i32 = arith.constant -1 : i32 loc(#loc1)
|
| 74 |
+
%c3_i32 = arith.constant 3 : i32 loc(#loc1)
|
| 75 |
+
%0 = arith.muli %ks0, %c4096_i32 : i32 loc(#loc2)
|
| 76 |
+
%q_start = tt.get_program_id x : i32 loc(#loc167)
|
| 77 |
+
%off_zq = tt.get_program_id y : i32 loc(#loc168)
|
| 78 |
+
%off_hq = tt.get_program_id z : i32 loc(#loc169)
|
| 79 |
+
%off_hkv = arith.divsi %off_hq, %c4_i32 : i32 loc(#loc170)
|
| 80 |
+
%q_offset = arith.muli %off_zq, %0 : i32 loc(#loc171)
|
| 81 |
+
%q_offset_19 = arith.muli %off_hq, %c128_i32 : i32 loc(#loc172)
|
| 82 |
+
%q_offset_20 = arith.addi %q_offset, %q_offset_19 : i32 loc(#loc173)
|
| 83 |
+
%k_offset = arith.muli %off_hkv, %c128_i32 : i32 loc(#loc174)
|
| 84 |
+
%Q = tt.addptr %arg_Q, %q_offset_20 : !tt.ptr<bf16>, i32 loc(#loc175)
|
| 85 |
+
%K = tt.addptr %arg_K, %k_offset : !tt.ptr<bf16>, i32 loc(#loc176)
|
| 86 |
+
%V = tt.addptr %arg_V, %k_offset : !tt.ptr<bf16>, i32 loc(#loc177)
|
| 87 |
+
%sparse_kv_idx_offset = arith.muli %q_start, %ks4 : i32 loc(#loc178)
|
| 88 |
+
%offs_m = arith.muli %q_start, %c128_i32 : i32 loc(#loc179)
|
| 89 |
+
%offs_m_21 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc180)
|
| 90 |
+
%offs_m_22 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc180)
|
| 91 |
+
%offs_m_23 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked1> loc(#loc180)
|
| 92 |
+
%offs_m_24 = tt.splat %offs_m : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc181)
|
| 93 |
+
%offs_m_25 = tt.splat %offs_m : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc181)
|
| 94 |
+
%offs_m_26 = tt.splat %offs_m : i32 -> tensor<128xi32, #blocked1> loc(#loc181)
|
| 95 |
+
%offs_m_27 = arith.addi %offs_m_24, %offs_m_21 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc181)
|
| 96 |
+
%offs_m_28 = arith.addi %offs_m_25, %offs_m_22 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc181)
|
| 97 |
+
%offs_m_29 = arith.addi %offs_m_26, %offs_m_23 : tensor<128xi32, #blocked1> loc(#loc181)
|
| 98 |
+
%ptr = tt.expand_dims %offs_m_27 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc297)
|
| 99 |
+
%ptr_30 = tt.expand_dims %offs_m_28 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xi32, #mma> loc(#loc297)
|
| 100 |
+
%ptr_31 = arith.muli %ptr, %cst_8 : tensor<128x1xi32, #blocked> loc(#loc298)
|
| 101 |
+
%ptr_32 = tt.splat %Q : !tt.ptr<bf16> -> tensor<128x1x!tt.ptr<bf16>, #blocked> loc(#loc299)
|
| 102 |
+
%ptr_33 = tt.addptr %ptr_32, %ptr_31 : tensor<128x1x!tt.ptr<bf16>, #blocked>, tensor<128x1xi32, #blocked> loc(#loc299)
|
| 103 |
+
%ptr_34 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc300)
|
| 104 |
+
%ptr_35 = tt.expand_dims %ptr_34 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc300)
|
| 105 |
+
%ptr_36 = tt.broadcast %ptr_33 : tensor<128x1x!tt.ptr<bf16>, #blocked> -> tensor<128x128x!tt.ptr<bf16>, #blocked> loc(#loc301)
|
| 106 |
+
%ptr_37 = tt.broadcast %ptr_35 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc301)
|
| 107 |
+
%ptr_38 = tt.addptr %ptr_36, %ptr_37 : tensor<128x128x!tt.ptr<bf16>, #blocked>, tensor<128x128xi32, #blocked> loc(#loc301)
|
| 108 |
+
%q = tt.splat %ks0 : i32 -> tensor<128x1xi32, #blocked> loc(#loc302)
|
| 109 |
+
%q_39 = tt.splat %ks0 : i32 -> tensor<128x1xi32, #mma> loc(#loc302)
|
| 110 |
+
%q_40 = arith.cmpi slt, %ptr, %q : tensor<128x1xi32, #blocked> loc(#loc302)
|
| 111 |
+
%q_41 = tt.broadcast %q_40 : tensor<128x1xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc303)
|
| 112 |
+
%q_42 = tt.load %ptr_38, %q_41, %cst_10 : tensor<128x128x!tt.ptr<bf16>, #blocked> loc(#loc303)
|
| 113 |
+
%q_43 = ttg.local_alloc %q_42 : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc303)
|
| 114 |
+
%kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset : !tt.ptr<i32>, i32 loc(#loc188)
|
| 115 |
+
%kv_start = tt.load %kv_indices : !tt.ptr<i32> loc(#loc189)
|
| 116 |
+
%kv_start_44 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc190)
|
| 117 |
+
%kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %q_start : !tt.ptr<i32>, i32 loc(#loc191)
|
| 118 |
+
%kv_num_blocks_45 = tt.load %kv_num_blocks : !tt.ptr<i32> loc(#loc192)
|
| 119 |
+
%block_n_end = arith.muli %kv_num_blocks_45, %c2_i32 : i32 loc(#loc193)
|
| 120 |
+
%block_n_end_46 = arith.addi %ks1, %c63_i32 : i32 loc(#loc304)
|
| 121 |
+
%block_n_end_47 = arith.divsi %block_n_end_46, %c64_i32 : i32 loc(#loc305)
|
| 122 |
+
%block_n_end_48 = arith.maxsi %block_n_end_47, %c1_i32 : i32 loc(#loc195)
|
| 123 |
+
%block_n_end_49 = arith.minsi %block_n_end, %block_n_end_48 : i32 loc(#loc196)
|
| 124 |
+
%offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc197)
|
| 125 |
+
%offs_n_50 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc197)
|
| 126 |
+
%offs_n_51 = tt.splat %kv_start_44 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc198)
|
| 127 |
+
%offs_n_52 = arith.addi %offs_n_51, %offs_n : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc198)
|
| 128 |
+
%1 = tt.expand_dims %offs_n_52 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc39)
|
| 129 |
+
%ptr_53 = tt.splat %K : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>, #blocked> loc(#loc393)
|
| 130 |
+
%ptr_54 = tt.broadcast %ptr_35 : tensor<1x128xi32, #blocked> -> tensor<64x128xi32, #blocked> loc(#loc394)
|
| 131 |
+
%k = tt.splat %ks1 : i32 -> tensor<64x1xi32, #blocked> loc(#loc395)
|
| 132 |
+
%m = arith.remsi %ptr_30, %q_39 : tensor<128x1xi32, #mma> loc(#loc396)
|
| 133 |
+
%n = tt.splat %ks1 : i32 -> tensor<1x64xi32, #mma> loc(#loc397)
|
| 134 |
+
%tmp3 = arith.cmpi slt, %m, %cst_0 : tensor<128x1xi32, #mma> loc(#loc309)
|
| 135 |
+
%tmp5 = tt.broadcast %m : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc310)
|
| 136 |
+
%tmp6 = tt.broadcast %tmp3 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc311)
|
| 137 |
+
%tmp7 = arith.cmpi sge, %m, %cst_0 : tensor<128x1xi32, #mma> loc(#loc312)
|
| 138 |
+
%tmp9 = tt.broadcast %tmp7 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc313)
|
| 139 |
+
%tmp14 = arith.remsi %m, %cst_5 : tensor<128x1xi32, #mma> loc(#loc314)
|
| 140 |
+
%tmp14_55 = arith.cmpi ne, %tmp14, %cst_0 : tensor<128x1xi32, #mma> loc(#loc315)
|
| 141 |
+
%tmp14_56 = arith.divsi %m, %cst_5 : tensor<128x1xi32, #mma> loc(#loc316)
|
| 142 |
+
%tmp14_57 = arith.subi %tmp14_56, %cst_1 : tensor<128x1xi32, #mma> loc(#loc317)
|
| 143 |
+
%tmp14_58 = arith.select %tmp14_55, %tmp14_57, %tmp14_56 : tensor<128x1xi1, #mma>, tensor<128x1xi32, #mma> loc(#loc318)
|
| 144 |
+
%tmp14_59 = arith.select %tmp3, %tmp14_58, %tmp14_56 : tensor<128x1xi1, #mma>, tensor<128x1xi32, #mma> loc(#loc319)
|
| 145 |
+
%tmp17 = tt.broadcast %tmp14_59 : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc320)
|
| 146 |
+
%ptr_60 = tt.splat %V : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>, #blocked> loc(#loc398)
|
| 147 |
+
%k_61 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc399)
|
| 148 |
+
%v = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc400)
|
| 149 |
+
%kv_offset = arith.cmpi sgt, %block_n_end_49, %c0_i32 : i32 loc(#loc462)
|
| 150 |
+
%offs_n_load = tt.splat %kv_start_44 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc323)
|
| 151 |
+
%offs_n_load_62 = arith.addi %offs_n_load, %offs_n_50 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc323)
|
| 152 |
+
%ptr_63 = tt.expand_dims %offs_n_load_62 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc402)
|
| 153 |
+
%ptr_64 = arith.muli %ptr_63, %cst_7 : tensor<64x1xi32, #blocked> loc(#loc403)
|
| 154 |
+
%ptr_65 = tt.addptr %ptr_53, %ptr_64 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc393)
|
| 155 |
+
%ptr_66 = tt.broadcast %ptr_65 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc394)
|
| 156 |
+
%ptr_67 = tt.addptr %ptr_66, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc394)
|
| 157 |
+
%k_68 = arith.cmpi slt, %ptr_63, %k : tensor<64x1xi32, #blocked> loc(#loc395)
|
| 158 |
+
%k_69 = tt.broadcast %k_68 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc399)
|
| 159 |
+
%k_70 = ttg.memdesc_index %k_61[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
|
| 160 |
+
%kv_offset_71 = tt.splat %kv_offset : i1 -> tensor<64x128xi1, #blocked> loc(#loc462)
|
| 161 |
+
%kv_offset_72 = arith.andi %kv_offset_71, %k_69 : tensor<64x128xi1, #blocked> loc(#loc462)
|
| 162 |
+
%k_73 = ttg.async_copy_global_to_local %ptr_67, %k_70 mask %kv_offset_72 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
|
| 163 |
+
%k_74 = ttg.async_commit_group tokens %k_73 loc(#loc399)
|
| 164 |
+
%ptr_75 = tt.addptr %ptr_60, %ptr_64 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc398)
|
| 165 |
+
%ptr_76 = tt.broadcast %ptr_75 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc404)
|
| 166 |
+
%ptr_77 = tt.addptr %ptr_76, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc404)
|
| 167 |
+
%v_78 = ttg.memdesc_index %v[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
|
| 168 |
+
%v_79 = ttg.async_copy_global_to_local %ptr_77, %v_78 mask %kv_offset_72 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
|
| 169 |
+
%v_80 = ttg.async_commit_group tokens %v_79 loc(#loc400)
|
| 170 |
+
%kv_offset_81 = arith.cmpi sgt, %block_n_end_49, %c1_i32 : i32 loc(#loc462)
|
| 171 |
+
%kv_base_offset = arith.addi %kv_start_44, %c64_i32 : i32 loc(#loc324)
|
| 172 |
+
%offs_n_load_82 = tt.splat %kv_base_offset : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc323)
|
| 173 |
+
%offs_n_load_83 = arith.addi %offs_n_load_82, %offs_n_50 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc323)
|
| 174 |
+
%ptr_84 = tt.expand_dims %offs_n_load_83 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc402)
|
| 175 |
+
%ptr_85 = arith.muli %ptr_84, %cst_7 : tensor<64x1xi32, #blocked> loc(#loc403)
|
| 176 |
+
%ptr_86 = tt.addptr %ptr_53, %ptr_85 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc393)
|
| 177 |
+
%ptr_87 = tt.broadcast %ptr_86 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc394)
|
| 178 |
+
%ptr_88 = tt.addptr %ptr_87, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc394)
|
| 179 |
+
%k_89 = arith.cmpi slt, %ptr_84, %k : tensor<64x1xi32, #blocked> loc(#loc395)
|
| 180 |
+
%k_90 = tt.broadcast %k_89 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc399)
|
| 181 |
+
%k_91 = ttg.memdesc_index %k_61[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
|
| 182 |
+
%kv_offset_92 = tt.splat %kv_offset_81 : i1 -> tensor<64x128xi1, #blocked> loc(#loc462)
|
| 183 |
+
%kv_offset_93 = arith.andi %kv_offset_92, %k_90 : tensor<64x128xi1, #blocked> loc(#loc462)
|
| 184 |
+
%k_94 = ttg.async_copy_global_to_local %ptr_88, %k_91 mask %kv_offset_93 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
|
| 185 |
+
%k_95 = ttg.async_commit_group tokens %k_94 loc(#loc399)
|
| 186 |
+
%ptr_96 = tt.addptr %ptr_60, %ptr_85 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc398)
|
| 187 |
+
%ptr_97 = tt.broadcast %ptr_96 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc404)
|
| 188 |
+
%ptr_98 = tt.addptr %ptr_97, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc404)
|
| 189 |
+
%v_99 = ttg.memdesc_index %v[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
|
| 190 |
+
%v_100 = ttg.async_copy_global_to_local %ptr_98, %v_99 mask %kv_offset_93 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
|
| 191 |
+
%v_101 = ttg.async_commit_group tokens %v_100 loc(#loc400)
|
| 192 |
+
ttng.fence_async_shared {bCluster = false} loc(#loc325)
|
| 193 |
+
%kv_offset_102:12 = scf.for %kv_offset_173 = %c0_i32 to %block_n_end_49 step %c1_i32 iter_args(%acc_174 = %cst_14, %arg17 = %cst_12, %arg18 = %cst_18, %arg19 = %c64_i32, %arg20 = %1, %arg21 = %c1_i32, %arg22 = %c-1_i32, %k_175 = %k_74, %k_176 = %k_95, %v_177 = %v_80, %v_178 = %v_101, %arg27 = %c64_i32) -> (tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32) : i32 {
|
| 194 |
+
%kv_offset_179 = arith.subi %block_n_end_49, %c2_i32 : i32 loc(#loc462)
|
| 195 |
+
%kv_offset_180 = arith.cmpi slt, %kv_offset_173, %kv_offset_179 : i32 loc(#loc462)
|
| 196 |
+
%kv_offset_181 = arith.subi %block_n_end_49, %c1_i32 : i32 loc(#loc462)
|
| 197 |
+
%kv_offset_182 = arith.cmpi slt, %kv_offset_173, %kv_offset_181 : i32 loc(#loc462)
|
| 198 |
+
%kv_offset_183 = arith.addi %arg22, %c1_i32 : i32 loc(#loc462)
|
| 199 |
+
%kv_offset_184 = arith.cmpi sge, %kv_offset_183, %c3_i32 : i32 loc(#loc462)
|
| 200 |
+
%kv_offset_185 = arith.select %kv_offset_184, %c0_i32, %kv_offset_183 : i32 loc(#loc462)
|
| 201 |
+
%k_186 = ttg.async_wait %k_175, %v_177 {num = 2 : i32} loc(#loc399)
|
| 202 |
+
%k_187 = ttg.memdesc_index %k_61[%kv_offset_185] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
|
| 203 |
+
%k_188 = ttg.memdesc_trans %k_187 {order = array<i32: 1, 0>} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc326)
|
| 204 |
+
%qk = ttng.warp_group_dot %q_43, %k_188, %cst_6 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc325)
|
| 205 |
+
%qk_189:4 = ttng.warp_group_dot_wait %qk, %q_43, %k_188, %acc_174 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64>, tensor<128x128xf32, #mma1> loc(#loc325)
|
| 206 |
+
%qk_190 = arith.mulf %qk_189#0, %cst_15 : tensor<128x64xf32, #mma> loc(#loc327)
|
| 207 |
+
%n_191 = arith.remsi %arg20, %n : tensor<1x64xi32, #mma> loc(#loc397)
|
| 208 |
+
%post_mod_scores = arith.cmpi slt, %arg20, %n : tensor<1x64xi32, #mma> loc(#loc328)
|
| 209 |
+
%post_mod_scores_192 = tt.broadcast %post_mod_scores : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc329)
|
| 210 |
+
%post_mod_scores_193 = arith.select %post_mod_scores_192, %qk_190, %cst_16 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc329)
|
| 211 |
+
%tmp5_194 = tt.broadcast %n_191 : tensor<1x64xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc310)
|
| 212 |
+
%tmp5_195 = arith.cmpi sle, %tmp5_194, %tmp5 : tensor<128x64xi32, #mma> loc(#loc310)
|
| 213 |
+
%tmp6_196 = arith.andi %tmp6, %tmp5_195 : tensor<128x64xi1, #mma> loc(#loc311)
|
| 214 |
+
%tmp8 = arith.cmpi slt, %n_191, %cst : tensor<1x64xi32, #mma> loc(#loc330)
|
| 215 |
+
%tmp9_197 = tt.broadcast %tmp8 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc313)
|
| 216 |
+
%tmp9_198 = arith.andi %tmp9, %tmp9_197 : tensor<128x64xi1, #mma> loc(#loc313)
|
| 217 |
+
%tmp10 = arith.extui %tmp8 : tensor<1x64xi1, #mma> to tensor<1x64xi32, #mma> loc(#loc331)
|
| 218 |
+
%tmp10_199 = arith.cmpi eq, %tmp10, %cst : tensor<1x64xi32, #mma> loc(#loc331)
|
| 219 |
+
%tmp11 = tt.broadcast %tmp10_199 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc332)
|
| 220 |
+
%tmp11_200 = arith.andi %tmp9, %tmp11 : tensor<128x64xi1, #mma> loc(#loc332)
|
| 221 |
+
%tmp16 = arith.remsi %n_191, %cst_4 : tensor<1x64xi32, #mma> loc(#loc333)
|
| 222 |
+
%tmp16_201 = arith.cmpi ne, %tmp16, %cst : tensor<1x64xi32, #mma> loc(#loc334)
|
| 223 |
+
%tmp16_202 = arith.divsi %n_191, %cst_4 : tensor<1x64xi32, #mma> loc(#loc335)
|
| 224 |
+
%tmp16_203 = arith.subi %tmp16_202, %cst_2 : tensor<1x64xi32, #mma> loc(#loc336)
|
| 225 |
+
%tmp16_204 = arith.select %tmp16_201, %tmp16_203, %tmp16_202 : tensor<1x64xi1, #mma>, tensor<1x64xi32, #mma> loc(#loc337)
|
| 226 |
+
%tmp16_205 = arith.select %tmp8, %tmp16_204, %tmp16_202 : tensor<1x64xi1, #mma>, tensor<1x64xi32, #mma> loc(#loc338)
|
| 227 |
+
%tmp17_206 = tt.broadcast %tmp16_205 : tensor<1x64xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc320)
|
| 228 |
+
%tmp17_207 = arith.cmpi eq, %tmp17, %tmp17_206 : tensor<128x64xi32, #mma> loc(#loc320)
|
| 229 |
+
%tmp18 = arith.andi %tmp11_200, %tmp17_207 : tensor<128x64xi1, #mma> loc(#loc339)
|
| 230 |
+
%tmp19 = arith.ori %tmp9_198, %tmp18 : tensor<128x64xi1, #mma> loc(#loc340)
|
| 231 |
+
%tmp20 = arith.ori %tmp6_196, %tmp19 : tensor<128x64xi1, #mma> loc(#loc341)
|
| 232 |
+
%mask_mod_output = arith.select %post_mod_scores_192, %tmp20, %cst_3 : tensor<128x64xi1, #mma>, tensor<128x64xi1, #mma> loc(#loc342)
|
| 233 |
+
%post_mod_scores_208 = arith.select %mask_mod_output, %post_mod_scores_193, %cst_16 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc343)
|
| 234 |
+
%post_mod_scores_209 = arith.mulf %post_mod_scores_208, %cst_17 : tensor<128x64xf32, #mma> loc(#loc344)
|
| 235 |
+
%m_ij = "tt.reduce"(%post_mod_scores_209) <{axis = 1 : i32}> ({
|
| 236 |
+
^bb0(%m_ij_267: f32 loc(callsite(#loc1 at #loc345)), %m_ij_268: f32 loc(callsite(#loc1 at #loc345))):
|
| 237 |
+
%m_ij_269 = arith.maxnumf %m_ij_267, %m_ij_268 : f32 loc(#loc457)
|
| 238 |
+
tt.reduce.return %m_ij_269 : f32 loc(#loc405)
|
| 239 |
+
}) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc405)
|
| 240 |
+
%m_ij_210 = arith.maxnumf %arg18, %m_ij : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc346)
|
| 241 |
+
%masked_out_rows = arith.cmpf oeq, %m_ij_210, %cst_18 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc347)
|
| 242 |
+
%m_ij_masked = arith.select %masked_out_rows, %cst_12, %m_ij_210 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc348)
|
| 243 |
+
%alpha = arith.subf %arg18, %m_ij_masked : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc349)
|
| 244 |
+
%alpha_211 = math.exp2 %alpha : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc350)
|
| 245 |
+
%p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc351)
|
| 246 |
+
%p_212 = tt.broadcast %p : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc352)
|
| 247 |
+
%p_213 = arith.subf %post_mod_scores_209, %p_212 : tensor<128x64xf32, #mma> loc(#loc352)
|
| 248 |
+
%p_214 = math.exp2 %p_213 : tensor<128x64xf32, #mma> loc(#loc353)
|
| 249 |
+
%l_i_215 = arith.mulf %arg17, %alpha_211 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc354)
|
| 250 |
+
%l_i_216 = "tt.reduce"(%p_214) <{axis = 1 : i32}> ({
|
| 251 |
+
^bb0(%l_i_267: f32 loc(callsite(#loc1 at #loc355)), %l_i_268: f32 loc(callsite(#loc1 at #loc355))):
|
| 252 |
+
%l_i_269 = arith.addf %l_i_267, %l_i_268 : f32 loc(#loc458)
|
| 253 |
+
tt.reduce.return %l_i_269 : f32 loc(#loc407)
|
| 254 |
+
}) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc407)
|
| 255 |
+
%l_i_217 = arith.addf %l_i_215, %l_i_216 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc356)
|
| 256 |
+
%acc_218 = tt.expand_dims %alpha_211 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc357)
|
| 257 |
+
%acc_219 = ttg.convert_layout %acc_218 : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc358)
|
| 258 |
+
%acc_220 = tt.broadcast %acc_219 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc358)
|
| 259 |
+
%acc_221 = arith.mulf %qk_189#3, %acc_220 : tensor<128x128xf32, #mma1> loc(#loc358)
|
| 260 |
+
%v_222 = ttg.memdesc_index %v[%kv_offset_185] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
|
| 261 |
+
%acc_223 = arith.truncf %p_214 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc359)
|
| 262 |
+
%acc_224 = ttg.convert_layout %acc_223 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc359)
|
| 263 |
+
%acc_225 = ttng.warp_group_dot %acc_224, %v_222, %acc_221 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc360)
|
| 264 |
+
%offs_n_226 = tt.splat %arg27 : i32 -> tensor<1x64xi32, #mma> loc(#loc361)
|
| 265 |
+
%offs_n_227 = arith.addi %arg20, %offs_n_226 : tensor<1x64xi32, #mma> loc(#loc361)
|
| 266 |
+
%kv_offset_228 = arith.addi %kv_offset_173, %c1_i32 : i32 loc(#loc462)
|
| 267 |
+
%cur_block_idx = arith.divsi %kv_offset_228, %c2_i32 : i32 loc(#loc409)
|
| 268 |
+
%cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr<i32>, i32 loc(#loc410)
|
| 269 |
+
%cur_block_229 = tt.load %cur_block, %kv_offset_182 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc411)
|
| 270 |
+
%next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc412)
|
| 271 |
+
%next_block_230 = arith.cmpi slt, %next_block, %kv_num_blocks_45 : i32 loc(#loc413)
|
| 272 |
+
%next_block_231 = tt.addptr %cur_block, %c1_i32 : !tt.ptr<i32>, i32 loc(#loc414)
|
| 273 |
+
%kv_offset_232 = arith.andi %kv_offset_182, %next_block_230 : i1 loc(#loc462)
|
| 274 |
+
%next_block_233 = tt.load %next_block_231, %kv_offset_232 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc415)
|
| 275 |
+
%needs_jump = arith.addi %kv_offset_173, %c2_i32 : i32 loc(#loc416)
|
| 276 |
+
%needs_jump_234 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc417)
|
| 277 |
+
%needs_jump_235 = arith.cmpi eq, %needs_jump_234, %c0_i32 : i32 loc(#loc418)
|
| 278 |
+
%jump_to_block = arith.subi %next_block_233, %cur_block_229 : i32 loc(#loc419)
|
| 279 |
+
%jump_to_block_236 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc420)
|
| 280 |
+
%jump_to_block_237 = arith.subi %jump_to_block_236, %c64_i32 : i32 loc(#loc421)
|
| 281 |
+
%offset = arith.extui %needs_jump_235 : i1 to i32 loc(#loc422)
|
| 282 |
+
%offset_238 = arith.muli %jump_to_block_237, %offset : i32 loc(#loc422)
|
| 283 |
+
%offset_239 = arith.subi %c1_i32, %offset : i32 loc(#loc423)
|
| 284 |
+
%offset_240 = arith.muli %offset_239, %c64_i32 : i32 loc(#loc424)
|
| 285 |
+
%offset_241 = arith.addi %offset_238, %offset_240 : i32 loc(#loc425)
|
| 286 |
+
%kv_offset_242 = arith.addi %arg19, %offset_241 : i32 loc(#loc363)
|
| 287 |
+
%kv_offset_243 = arith.addi %arg21, %c1_i32 : i32 loc(#loc462)
|
| 288 |
+
%kv_offset_244 = arith.cmpi sge, %kv_offset_243, %c3_i32 : i32 loc(#loc462)
|
| 289 |
+
%kv_offset_245 = arith.select %kv_offset_244, %c0_i32, %kv_offset_243 : i32 loc(#loc462)
|
| 290 |
+
%kv_base_offset_246 = arith.addi %kv_start_44, %kv_offset_242 : i32 loc(#loc324)
|
| 291 |
+
%offs_n_load_247 = tt.splat %kv_base_offset_246 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc323)
|
| 292 |
+
%offs_n_load_248 = arith.addi %offs_n_load_247, %offs_n_50 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc323)
|
| 293 |
+
%ptr_249 = tt.expand_dims %offs_n_load_248 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc402)
|
| 294 |
+
%ptr_250 = arith.muli %ptr_249, %cst_7 : tensor<64x1xi32, #blocked> loc(#loc403)
|
| 295 |
+
%ptr_251 = tt.addptr %ptr_53, %ptr_250 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc393)
|
| 296 |
+
%ptr_252 = tt.broadcast %ptr_251 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc394)
|
| 297 |
+
%ptr_253 = tt.addptr %ptr_252, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc394)
|
| 298 |
+
%k_254 = arith.cmpi slt, %ptr_249, %k : tensor<64x1xi32, #blocked> loc(#loc395)
|
| 299 |
+
%k_255 = tt.broadcast %k_254 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc399)
|
| 300 |
+
%k_256 = ttg.memdesc_index %k_61[%kv_offset_245] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
|
| 301 |
+
%kv_offset_257 = tt.splat %kv_offset_180 : i1 -> tensor<64x128xi1, #blocked> loc(#loc462)
|
| 302 |
+
%kv_offset_258 = arith.andi %kv_offset_257, %k_255 : tensor<64x128xi1, #blocked> loc(#loc462)
|
| 303 |
+
%k_259 = ttg.async_copy_global_to_local %ptr_253, %k_256 mask %kv_offset_258 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
|
| 304 |
+
%k_260 = ttg.async_commit_group tokens %k_259 loc(#loc399)
|
| 305 |
+
%ptr_261 = tt.addptr %ptr_60, %ptr_250 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc398)
|
| 306 |
+
%ptr_262 = tt.broadcast %ptr_261 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc404)
|
| 307 |
+
%ptr_263 = tt.addptr %ptr_262, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc404)
|
| 308 |
+
%v_264 = ttg.memdesc_index %v[%kv_offset_245] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
|
| 309 |
+
%v_265 = ttg.async_copy_global_to_local %ptr_263, %v_264 mask %kv_offset_258 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
|
| 310 |
+
%v_266 = ttg.async_commit_group tokens %v_265 loc(#loc400)
|
| 311 |
+
scf.yield %acc_225, %l_i_217, %m_ij_210, %kv_offset_242, %offs_n_227, %kv_offset_245, %kv_offset_185, %k_176, %k_260, %v_178, %v_266, %offset_241 : tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc462)
|
| 312 |
+
} loc(#loc462)
|
| 313 |
+
%kv_offset_103 = ttng.warp_group_dot_wait %kv_offset_102#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc462)
|
| 314 |
+
%kv_offset_104 = ttg.async_wait {num = 0 : i32} loc(#loc462)
|
| 315 |
+
ttg.local_dealloc %v : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc462)
|
| 316 |
+
ttg.local_dealloc %k_61 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc462)
|
| 317 |
+
%kv_indices_105 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset : !tt.ptr<i32>, i32 loc(#loc275)
|
| 318 |
+
%kv_start_106 = tt.load %kv_indices_105 : !tt.ptr<i32> loc(#loc276)
|
| 319 |
+
%kv_start_107 = arith.muli %kv_start_106, %c128_i32 : i32 loc(#loc277)
|
| 320 |
+
%kv_num_blocks_108 = tt.addptr %arg_FULL_KV_NUM_BLKS, %q_start : !tt.ptr<i32>, i32 loc(#loc278)
|
| 321 |
+
%kv_num_blocks_109 = tt.load %kv_num_blocks_108 : !tt.ptr<i32> loc(#loc279)
|
| 322 |
+
%block_n_end_110 = arith.muli %kv_num_blocks_109, %c2_i32 : i32 loc(#loc280)
|
| 323 |
+
%block_n_end_111 = arith.minsi %block_n_end_110, %block_n_end_48 : i32 loc(#loc281)
|
| 324 |
+
%offs_n_112 = tt.splat %kv_start_107 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc282)
|
| 325 |
+
%offs_n_113 = arith.addi %offs_n_112, %offs_n : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc282)
|
| 326 |
+
%2 = tt.expand_dims %offs_n_113 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc130)
|
| 327 |
+
%k_114 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc426)
|
| 328 |
+
%v_115 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc427)
|
| 329 |
+
%kv_offset_116 = arith.cmpi sgt, %block_n_end_111, %c0_i32 : i32 loc(#loc463)
|
| 330 |
+
%offs_n_load_117 = tt.splat %kv_start_107 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc366)
|
| 331 |
+
%offs_n_load_118 = arith.addi %offs_n_load_117, %offs_n_50 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc366)
|
| 332 |
+
%ptr_119 = tt.expand_dims %offs_n_load_118 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc428)
|
| 333 |
+
%ptr_120 = arith.muli %ptr_119, %cst_7 : tensor<64x1xi32, #blocked> loc(#loc429)
|
| 334 |
+
%ptr_121 = tt.addptr %ptr_53, %ptr_120 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc430)
|
| 335 |
+
%ptr_122 = tt.broadcast %ptr_121 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc431)
|
| 336 |
+
%ptr_123 = tt.addptr %ptr_122, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc431)
|
| 337 |
+
%k_124 = arith.cmpi slt, %ptr_119, %k : tensor<64x1xi32, #blocked> loc(#loc432)
|
| 338 |
+
%k_125 = tt.broadcast %k_124 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc426)
|
| 339 |
+
%k_126 = ttg.memdesc_index %k_114[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
|
| 340 |
+
%kv_offset_127 = tt.splat %kv_offset_116 : i1 -> tensor<64x128xi1, #blocked> loc(#loc463)
|
| 341 |
+
%kv_offset_128 = arith.andi %kv_offset_127, %k_125 : tensor<64x128xi1, #blocked> loc(#loc463)
|
| 342 |
+
%k_129 = ttg.async_copy_global_to_local %ptr_123, %k_126 mask %kv_offset_128 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
|
| 343 |
+
%k_130 = ttg.async_commit_group tokens %k_129 loc(#loc426)
|
| 344 |
+
%ptr_131 = tt.addptr %ptr_60, %ptr_120 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc433)
|
| 345 |
+
%ptr_132 = tt.broadcast %ptr_131 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc434)
|
| 346 |
+
%ptr_133 = tt.addptr %ptr_132, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc434)
|
| 347 |
+
%v_134 = ttg.memdesc_index %v_115[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
|
| 348 |
+
%v_135 = ttg.async_copy_global_to_local %ptr_133, %v_134 mask %kv_offset_128 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
|
| 349 |
+
%v_136 = ttg.async_commit_group tokens %v_135 loc(#loc427)
|
| 350 |
+
%kv_offset_137 = arith.cmpi sgt, %block_n_end_111, %c1_i32 : i32 loc(#loc463)
|
| 351 |
+
%kv_base_offset_138 = arith.addi %kv_start_107, %c64_i32 : i32 loc(#loc367)
|
| 352 |
+
%offs_n_load_139 = tt.splat %kv_base_offset_138 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc366)
|
| 353 |
+
%offs_n_load_140 = arith.addi %offs_n_load_139, %offs_n_50 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc366)
|
| 354 |
+
%ptr_141 = tt.expand_dims %offs_n_load_140 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc428)
|
| 355 |
+
%ptr_142 = arith.muli %ptr_141, %cst_7 : tensor<64x1xi32, #blocked> loc(#loc429)
|
| 356 |
+
%ptr_143 = tt.addptr %ptr_53, %ptr_142 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc430)
|
| 357 |
+
%ptr_144 = tt.broadcast %ptr_143 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc431)
|
| 358 |
+
%ptr_145 = tt.addptr %ptr_144, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc431)
|
| 359 |
+
%k_146 = arith.cmpi slt, %ptr_141, %k : tensor<64x1xi32, #blocked> loc(#loc432)
|
| 360 |
+
%k_147 = tt.broadcast %k_146 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc426)
|
| 361 |
+
%k_148 = ttg.memdesc_index %k_114[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
|
| 362 |
+
%kv_offset_149 = tt.splat %kv_offset_137 : i1 -> tensor<64x128xi1, #blocked> loc(#loc463)
|
| 363 |
+
%kv_offset_150 = arith.andi %kv_offset_149, %k_147 : tensor<64x128xi1, #blocked> loc(#loc463)
|
| 364 |
+
%k_151 = ttg.async_copy_global_to_local %ptr_145, %k_148 mask %kv_offset_150 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
|
| 365 |
+
%k_152 = ttg.async_commit_group tokens %k_151 loc(#loc426)
|
| 366 |
+
%ptr_153 = tt.addptr %ptr_60, %ptr_142 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc433)
|
| 367 |
+
%ptr_154 = tt.broadcast %ptr_153 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc434)
|
| 368 |
+
%ptr_155 = tt.addptr %ptr_154, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc434)
|
| 369 |
+
%v_156 = ttg.memdesc_index %v_115[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
|
| 370 |
+
%v_157 = ttg.async_copy_global_to_local %ptr_155, %v_156 mask %kv_offset_150 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
|
| 371 |
+
%v_158 = ttg.async_commit_group tokens %v_157 loc(#loc427)
|
| 372 |
+
ttng.fence_async_shared {bCluster = false} loc(#loc368)
|
| 373 |
+
%kv_offset_159:12 = scf.for %kv_offset_173 = %c0_i32 to %block_n_end_111 step %c1_i32 iter_args(%kv_offset_174 = %kv_offset_103, %kv_offset_175 = %kv_offset_102#1, %kv_offset_176 = %kv_offset_102#2, %arg19 = %c64_i32, %arg20 = %2, %arg21 = %c1_i32, %arg22 = %c-1_i32, %k_177 = %k_130, %k_178 = %k_152, %v_179 = %v_136, %v_180 = %v_158, %arg27 = %c64_i32) -> (tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32) : i32 {
|
| 374 |
+
%kv_offset_181 = arith.subi %block_n_end_111, %c2_i32 : i32 loc(#loc463)
|
| 375 |
+
%kv_offset_182 = arith.cmpi slt, %kv_offset_173, %kv_offset_181 : i32 loc(#loc463)
|
| 376 |
+
%kv_offset_183 = arith.subi %block_n_end_111, %c1_i32 : i32 loc(#loc463)
|
| 377 |
+
%kv_offset_184 = arith.cmpi slt, %kv_offset_173, %kv_offset_183 : i32 loc(#loc463)
|
| 378 |
+
%kv_offset_185 = arith.addi %arg22, %c1_i32 : i32 loc(#loc463)
|
| 379 |
+
%kv_offset_186 = arith.cmpi sge, %kv_offset_185, %c3_i32 : i32 loc(#loc463)
|
| 380 |
+
%kv_offset_187 = arith.select %kv_offset_186, %c0_i32, %kv_offset_185 : i32 loc(#loc463)
|
| 381 |
+
%k_188 = ttg.async_wait %k_177, %v_179 {num = 2 : i32} loc(#loc426)
|
| 382 |
+
%k_189 = ttg.memdesc_index %k_114[%kv_offset_187] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
|
| 383 |
+
%k_190 = ttg.memdesc_trans %k_189 {order = array<i32: 1, 0>} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc369)
|
| 384 |
+
%qk = ttng.warp_group_dot %q_43, %k_190, %cst_6 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc368)
|
| 385 |
+
%qk_191:4 = ttng.warp_group_dot_wait %qk, %q_43, %k_190, %kv_offset_174 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64>, tensor<128x128xf32, #mma1> loc(#loc368)
|
| 386 |
+
%qk_192 = arith.mulf %qk_191#0, %cst_15 : tensor<128x64xf32, #mma> loc(#loc370)
|
| 387 |
+
%post_mod_scores = arith.cmpi slt, %arg20, %n : tensor<1x64xi32, #mma> loc(#loc371)
|
| 388 |
+
%post_mod_scores_193 = tt.broadcast %post_mod_scores : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc372)
|
| 389 |
+
%post_mod_scores_194 = arith.select %post_mod_scores_193, %qk_192, %cst_16 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc372)
|
| 390 |
+
%post_mod_scores_195 = arith.mulf %post_mod_scores_194, %cst_17 : tensor<128x64xf32, #mma> loc(#loc373)
|
| 391 |
+
%m_ij = "tt.reduce"(%post_mod_scores_195) <{axis = 1 : i32}> ({
|
| 392 |
+
^bb0(%m_ij_253: f32 loc(callsite(#loc1 at #loc374)), %m_ij_254: f32 loc(callsite(#loc1 at #loc374))):
|
| 393 |
+
%m_ij_255 = arith.maxnumf %m_ij_253, %m_ij_254 : f32 loc(#loc459)
|
| 394 |
+
tt.reduce.return %m_ij_255 : f32 loc(#loc435)
|
| 395 |
+
}) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc435)
|
| 396 |
+
%m_ij_196 = arith.maxnumf %kv_offset_176, %m_ij : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc375)
|
| 397 |
+
%masked_out_rows = arith.cmpf oeq, %m_ij_196, %cst_18 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc376)
|
| 398 |
+
%m_ij_masked = arith.select %masked_out_rows, %cst_12, %m_ij_196 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc377)
|
| 399 |
+
%alpha = arith.subf %kv_offset_176, %m_ij_masked : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc378)
|
| 400 |
+
%alpha_197 = math.exp2 %alpha : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc379)
|
| 401 |
+
%p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc380)
|
| 402 |
+
%p_198 = tt.broadcast %p : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc381)
|
| 403 |
+
%p_199 = arith.subf %post_mod_scores_195, %p_198 : tensor<128x64xf32, #mma> loc(#loc381)
|
| 404 |
+
%p_200 = math.exp2 %p_199 : tensor<128x64xf32, #mma> loc(#loc382)
|
| 405 |
+
%l_i_201 = arith.mulf %kv_offset_175, %alpha_197 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc383)
|
| 406 |
+
%l_i_202 = "tt.reduce"(%p_200) <{axis = 1 : i32}> ({
|
| 407 |
+
^bb0(%l_i_253: f32 loc(callsite(#loc1 at #loc384)), %l_i_254: f32 loc(callsite(#loc1 at #loc384))):
|
| 408 |
+
%l_i_255 = arith.addf %l_i_253, %l_i_254 : f32 loc(#loc460)
|
| 409 |
+
tt.reduce.return %l_i_255 : f32 loc(#loc437)
|
| 410 |
+
}) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc437)
|
| 411 |
+
%l_i_203 = arith.addf %l_i_201, %l_i_202 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc385)
|
| 412 |
+
%acc_204 = tt.expand_dims %alpha_197 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc386)
|
| 413 |
+
%acc_205 = ttg.convert_layout %acc_204 : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc387)
|
| 414 |
+
%acc_206 = tt.broadcast %acc_205 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc387)
|
| 415 |
+
%acc_207 = arith.mulf %qk_191#3, %acc_206 : tensor<128x128xf32, #mma1> loc(#loc387)
|
| 416 |
+
%v_208 = ttg.memdesc_index %v_115[%kv_offset_187] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
|
| 417 |
+
%acc_209 = arith.truncf %p_200 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc388)
|
| 418 |
+
%acc_210 = ttg.convert_layout %acc_209 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc388)
|
| 419 |
+
%acc_211 = ttng.warp_group_dot %acc_210, %v_208, %acc_207 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc389)
|
| 420 |
+
%offs_n_212 = tt.splat %arg27 : i32 -> tensor<1x64xi32, #mma> loc(#loc390)
|
| 421 |
+
%offs_n_213 = arith.addi %arg20, %offs_n_212 : tensor<1x64xi32, #mma> loc(#loc390)
|
| 422 |
+
%kv_offset_214 = arith.addi %kv_offset_173, %c1_i32 : i32 loc(#loc463)
|
| 423 |
+
%cur_block_idx = arith.divsi %kv_offset_214, %c2_i32 : i32 loc(#loc439)
|
| 424 |
+
%cur_block = tt.addptr %kv_indices_105, %cur_block_idx : !tt.ptr<i32>, i32 loc(#loc440)
|
| 425 |
+
%cur_block_215 = tt.load %cur_block, %kv_offset_184 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc441)
|
| 426 |
+
%next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc442)
|
| 427 |
+
%next_block_216 = arith.cmpi slt, %next_block, %kv_num_blocks_109 : i32 loc(#loc443)
|
| 428 |
+
%next_block_217 = tt.addptr %cur_block, %c1_i32 : !tt.ptr<i32>, i32 loc(#loc444)
|
| 429 |
+
%kv_offset_218 = arith.andi %kv_offset_184, %next_block_216 : i1 loc(#loc463)
|
| 430 |
+
%next_block_219 = tt.load %next_block_217, %kv_offset_218 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc445)
|
| 431 |
+
%needs_jump = arith.addi %kv_offset_173, %c2_i32 : i32 loc(#loc446)
|
| 432 |
+
%needs_jump_220 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc447)
|
| 433 |
+
%needs_jump_221 = arith.cmpi eq, %needs_jump_220, %c0_i32 : i32 loc(#loc448)
|
| 434 |
+
%jump_to_block = arith.subi %next_block_219, %cur_block_215 : i32 loc(#loc449)
|
| 435 |
+
%jump_to_block_222 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc450)
|
| 436 |
+
%jump_to_block_223 = arith.subi %jump_to_block_222, %c64_i32 : i32 loc(#loc451)
|
| 437 |
+
%offset = arith.extui %needs_jump_221 : i1 to i32 loc(#loc452)
|
| 438 |
+
%offset_224 = arith.muli %jump_to_block_223, %offset : i32 loc(#loc452)
|
| 439 |
+
%offset_225 = arith.subi %c1_i32, %offset : i32 loc(#loc453)
|
| 440 |
+
%offset_226 = arith.muli %offset_225, %c64_i32 : i32 loc(#loc454)
|
| 441 |
+
%offset_227 = arith.addi %offset_224, %offset_226 : i32 loc(#loc455)
|
| 442 |
+
%kv_offset_228 = arith.addi %arg19, %offset_227 : i32 loc(#loc392)
|
| 443 |
+
%kv_offset_229 = arith.addi %arg21, %c1_i32 : i32 loc(#loc463)
|
| 444 |
+
%kv_offset_230 = arith.cmpi sge, %kv_offset_229, %c3_i32 : i32 loc(#loc463)
|
| 445 |
+
%kv_offset_231 = arith.select %kv_offset_230, %c0_i32, %kv_offset_229 : i32 loc(#loc463)
|
| 446 |
+
%kv_base_offset_232 = arith.addi %kv_start_107, %kv_offset_228 : i32 loc(#loc367)
|
| 447 |
+
%offs_n_load_233 = tt.splat %kv_base_offset_232 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc366)
|
| 448 |
+
%offs_n_load_234 = arith.addi %offs_n_load_233, %offs_n_50 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc366)
|
| 449 |
+
%ptr_235 = tt.expand_dims %offs_n_load_234 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc428)
|
| 450 |
+
%ptr_236 = arith.muli %ptr_235, %cst_7 : tensor<64x1xi32, #blocked> loc(#loc429)
|
| 451 |
+
%ptr_237 = tt.addptr %ptr_53, %ptr_236 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc430)
|
| 452 |
+
%ptr_238 = tt.broadcast %ptr_237 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc431)
|
| 453 |
+
%ptr_239 = tt.addptr %ptr_238, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc431)
|
| 454 |
+
%k_240 = arith.cmpi slt, %ptr_235, %k : tensor<64x1xi32, #blocked> loc(#loc432)
|
| 455 |
+
%k_241 = tt.broadcast %k_240 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc426)
|
| 456 |
+
%k_242 = ttg.memdesc_index %k_114[%kv_offset_231] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
|
| 457 |
+
%kv_offset_243 = tt.splat %kv_offset_182 : i1 -> tensor<64x128xi1, #blocked> loc(#loc463)
|
| 458 |
+
%kv_offset_244 = arith.andi %kv_offset_243, %k_241 : tensor<64x128xi1, #blocked> loc(#loc463)
|
| 459 |
+
%k_245 = ttg.async_copy_global_to_local %ptr_239, %k_242 mask %kv_offset_244 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
|
| 460 |
+
%k_246 = ttg.async_commit_group tokens %k_245 loc(#loc426)
|
| 461 |
+
%ptr_247 = tt.addptr %ptr_60, %ptr_236 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc433)
|
| 462 |
+
%ptr_248 = tt.broadcast %ptr_247 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc434)
|
| 463 |
+
%ptr_249 = tt.addptr %ptr_248, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc434)
|
| 464 |
+
%v_250 = ttg.memdesc_index %v_115[%kv_offset_231] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
|
| 465 |
+
%v_251 = ttg.async_copy_global_to_local %ptr_249, %v_250 mask %kv_offset_244 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
|
| 466 |
+
%v_252 = ttg.async_commit_group tokens %v_251 loc(#loc427)
|
| 467 |
+
scf.yield %acc_211, %l_i_203, %m_ij_196, %kv_offset_228, %offs_n_213, %kv_offset_231, %kv_offset_187, %k_178, %k_246, %v_180, %v_252, %offset_227 : tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc463)
|
| 468 |
+
} loc(#loc463)
|
| 469 |
+
%kv_offset_160 = ttng.warp_group_dot_wait %kv_offset_159#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc463)
|
| 470 |
+
%kv_offset_161 = ttg.async_wait {num = 0 : i32} loc(#loc463)
|
| 471 |
+
ttg.local_dealloc %v_115 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc463)
|
| 472 |
+
ttg.local_dealloc %k_114 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc463)
|
| 473 |
+
%l_i = arith.cmpf oeq, %kv_offset_159#1, %cst_12 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc284)
|
| 474 |
+
%l_i_162 = arith.select %l_i, %cst_13, %kv_offset_159#1 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc285)
|
| 475 |
+
%acc = tt.expand_dims %l_i_162 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc286)
|
| 476 |
+
%acc_163 = ttg.convert_layout %acc : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc287)
|
| 477 |
+
%acc_164 = tt.broadcast %acc_163 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc287)
|
| 478 |
+
%acc_165 = arith.divf %kv_offset_160, %acc_164 : tensor<128x128xf32, #mma1> loc(#loc287)
|
| 479 |
+
%mask = arith.cmpi slt, %ptr_35, %cst_9 : tensor<1x128xi32, #blocked> loc(#loc288)
|
| 480 |
+
%mask_166 = tt.broadcast %mask : tensor<1x128xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc289)
|
| 481 |
+
%mask_167 = arith.andi %q_41, %mask_166 : tensor<128x128xi1, #blocked> loc(#loc289)
|
| 482 |
+
%3 = tt.splat %q_offset_19 : i32 -> tensor<1x128xi32, #blocked> loc(#loc138)
|
| 483 |
+
%4 = arith.addi %ptr_35, %3 : tensor<1x128xi32, #blocked> loc(#loc138)
|
| 484 |
+
%5 = tt.broadcast %4 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc139)
|
| 485 |
+
%6 = tt.broadcast %ptr_31 : tensor<128x1xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc139)
|
| 486 |
+
%7 = arith.addi %5, %6 : tensor<128x128xi32, #blocked> loc(#loc139)
|
| 487 |
+
%8 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<128x128x!tt.ptr<bf16>, #blocked> loc(#loc140)
|
| 488 |
+
%9 = tt.addptr %8, %7 : tensor<128x128x!tt.ptr<bf16>, #blocked>, tensor<128x128xi32, #blocked> loc(#loc140)
|
| 489 |
+
%10 = arith.truncf %acc_165 : tensor<128x128xf32, #mma1> to tensor<128x128xbf16, #mma1> loc(#loc141)
|
| 490 |
+
%11 = ttg.convert_layout %10 : tensor<128x128xbf16, #mma1> -> tensor<128x128xbf16, #blocked> loc(#loc141)
|
| 491 |
+
tt.store %9, %11, %mask_167 : tensor<128x128x!tt.ptr<bf16>, #blocked> loc(#loc141)
|
| 492 |
+
%off_hz = arith.muli %off_zq, %c32_i32 : i32 loc(#loc290)
|
| 493 |
+
%off_hz_168 = arith.addi %off_hz, %off_hq : i32 loc(#loc291)
|
| 494 |
+
%l_ptrs = arith.muli %off_hz_168, %ks0 : i32 loc(#loc292)
|
| 495 |
+
%l_ptrs_169 = tt.addptr %arg_LSE, %l_ptrs : !tt.ptr<f32>, i32 loc(#loc293)
|
| 496 |
+
%l_ptrs_170 = tt.splat %l_ptrs_169 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>, #blocked1> loc(#loc294)
|
| 497 |
+
%l_ptrs_171 = tt.addptr %l_ptrs_170, %offs_m_29 : tensor<128x!tt.ptr<f32>, #blocked1>, tensor<128xi32, #blocked1> loc(#loc294)
|
| 498 |
+
%lse = math.log2 %l_i_162 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc295)
|
| 499 |
+
%lse_172 = arith.addf %kv_offset_159#2, %lse : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc296)
|
| 500 |
+
%12 = tt.splat %ks0 : i32 -> tensor<128xi32, #blocked1> loc(#loc149)
|
| 501 |
+
%13 = arith.cmpi slt, %offs_m_29, %12 : tensor<128xi32, #blocked1> loc(#loc149)
|
| 502 |
+
%14 = ttg.convert_layout %lse_172 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128xf32, #blocked1> loc(#loc150)
|
| 503 |
+
tt.store %l_ptrs_171, %14, %13 : tensor<128x!tt.ptr<f32>, #blocked1> loc(#loc150)
|
| 504 |
+
tt.return loc(#loc151)
|
| 505 |
+
} loc(#loc)
|
| 506 |
+
} loc(#loc)
|
| 507 |
+
#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":85:54)
|
| 508 |
+
#loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":97:28)
|
| 509 |
+
#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":98:27)
|
| 510 |
+
#loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":99:27)
|
| 511 |
+
#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":104:24)
|
| 512 |
+
#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":107:24)
|
| 513 |
+
#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":107:45)
|
| 514 |
+
#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":107:36)
|
| 515 |
+
#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":108:47)
|
| 516 |
+
#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":111:12)
|
| 517 |
+
#loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":112:12)
|
| 518 |
+
#loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":113:12)
|
| 519 |
+
#loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":143:97)
|
| 520 |
+
#loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":144:23)
|
| 521 |
+
#loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":144:46)
|
| 522 |
+
#loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":144:33)
|
| 523 |
+
#loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":284:27)
|
| 524 |
+
#loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":146:101)
|
| 525 |
+
#loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":284:38)
|
| 526 |
+
#loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":284:20)
|
| 527 |
+
#loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":284:56)
|
| 528 |
+
#loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":284:49)
|
| 529 |
+
#loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":292:52)
|
| 530 |
+
#loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":292:23)
|
| 531 |
+
#loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":151:26)
|
| 532 |
+
#loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":152:23)
|
| 533 |
+
#loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":152:37)
|
| 534 |
+
#loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":153:42)
|
| 535 |
+
#loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":153:28)
|
| 536 |
+
#loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":154:45)
|
| 537 |
+
#loc32 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22)
|
| 538 |
+
#loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":154:92)
|
| 539 |
+
#loc34 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28)
|
| 540 |
+
#loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":154:102)
|
| 541 |
+
#loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":154:65)
|
| 542 |
+
#loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":159:37)
|
| 543 |
+
#loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":159:24)
|
| 544 |
+
#loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":167:48)
|
| 545 |
+
#loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":347:107)
|
| 546 |
+
#loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":257:21)
|
| 547 |
+
#loc44 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":358:36)
|
| 548 |
+
#loc45 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":359:36)
|
| 549 |
+
#loc46 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":372:22)
|
| 550 |
+
#loc47 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":374:23)
|
| 551 |
+
#loc48 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":375:22)
|
| 552 |
+
#loc49 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":376:23)
|
| 553 |
+
#loc50 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":378:22)
|
| 554 |
+
#loc51 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":383:70)
|
| 555 |
+
#loc52 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":383:79)
|
| 556 |
+
#loc53 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":383:91)
|
| 557 |
+
#loc54 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":383:99)
|
| 558 |
+
#loc55 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":383:102)
|
| 559 |
+
#loc56 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":383:119)
|
| 560 |
+
#loc57 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":386:25)
|
| 561 |
+
#loc58 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":421:107)
|
| 562 |
+
#loc59 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":484:40)
|
| 563 |
+
#loc60 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":346:35)
|
| 564 |
+
#loc61 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":342:32)
|
| 565 |
+
#loc62 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":351:19)
|
| 566 |
+
#loc63 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":349:17)
|
| 567 |
+
#loc64 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":353:14)
|
| 568 |
+
#loc65 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":367:44)
|
| 569 |
+
#loc66 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":367:69)
|
| 570 |
+
#loc67 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":377:22)
|
| 571 |
+
#loc68 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":379:24)
|
| 572 |
+
#loc69 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":380:23)
|
| 573 |
+
#loc70 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":385:70)
|
| 574 |
+
#loc71 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":385:79)
|
| 575 |
+
#loc72 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":385:91)
|
| 576 |
+
#loc73 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":385:99)
|
| 577 |
+
#loc74 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":385:102)
|
| 578 |
+
#loc75 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":385:119)
|
| 579 |
+
#loc76 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":387:24)
|
| 580 |
+
#loc77 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":388:23)
|
| 581 |
+
#loc78 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":389:23)
|
| 582 |
+
#loc79 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":394:73)
|
| 583 |
+
#loc80 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":396:69)
|
| 584 |
+
#loc81 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":399:27)
|
| 585 |
+
#loc82 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40)
|
| 586 |
+
#loc84 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27)
|
| 587 |
+
#loc85 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":403:27)
|
| 588 |
+
#loc86 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":405:35)
|
| 589 |
+
#loc87 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":406:51)
|
| 590 |
+
#loc88 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":410:31)
|
| 591 |
+
#loc89 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":410:25)
|
| 592 |
+
#loc90 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":411:51)
|
| 593 |
+
#loc91 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":411:39)
|
| 594 |
+
#loc92 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":411:21)
|
| 595 |
+
#loc93 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":416:16)
|
| 596 |
+
#loc94 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
|
| 597 |
+
#loc96 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
|
| 598 |
+
#loc97 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":416:24)
|
| 599 |
+
#loc98 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":418:22)
|
| 600 |
+
#loc99 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":418:16)
|
| 601 |
+
#loc100 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":422:22)
|
| 602 |
+
#loc101 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":422:44)
|
| 603 |
+
#loc102 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":530:26)
|
| 604 |
+
#loc103 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":247:33)
|
| 605 |
+
#loc104 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":527:63)
|
| 606 |
+
#loc105 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":248:38)
|
| 607 |
+
#loc106 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":248:24)
|
| 608 |
+
#loc107 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":249:109)
|
| 609 |
+
#loc108 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":249:113)
|
| 610 |
+
#loc109 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":249:55)
|
| 611 |
+
#loc110 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":249:25)
|
| 612 |
+
#loc111 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":250:30)
|
| 613 |
+
#loc112 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":250:35)
|
| 614 |
+
#loc113 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":250:60)
|
| 615 |
+
#loc114 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":251:34)
|
| 616 |
+
#loc115 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":251:48)
|
| 617 |
+
#loc116 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":251:63)
|
| 618 |
+
#loc117 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":252:29)
|
| 619 |
+
#loc118 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":252:47)
|
| 620 |
+
#loc119 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":252:61)
|
| 621 |
+
#loc120 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":252:42)
|
| 622 |
+
#loc121 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":531:21)
|
| 623 |
+
#loc122 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":181:35)
|
| 624 |
+
#loc123 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":182:27)
|
| 625 |
+
#loc124 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":182:41)
|
| 626 |
+
#loc125 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":183:51)
|
| 627 |
+
#loc126 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":183:32)
|
| 628 |
+
#loc127 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":184:49)
|
| 629 |
+
#loc128 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":184:69)
|
| 630 |
+
#loc129 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":186:28)
|
| 631 |
+
#loc130 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":193:52)
|
| 632 |
+
#loc132 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":206:26)
|
| 633 |
+
#loc133 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":206:34)
|
| 634 |
+
#loc134 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":208:20)
|
| 635 |
+
#loc135 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":208:16)
|
| 636 |
+
#loc136 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":214:38)
|
| 637 |
+
#loc137 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":214:30)
|
| 638 |
+
#loc138 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":218:49)
|
| 639 |
+
#loc139 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":218:62)
|
| 640 |
+
#loc140 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":218:25)
|
| 641 |
+
#loc141 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":218:92)
|
| 642 |
+
#loc142 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":221:26)
|
| 643 |
+
#loc143 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":221:31)
|
| 644 |
+
#loc144 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":222:32)
|
| 645 |
+
#loc145 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":222:23)
|
| 646 |
+
#loc146 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":222:40)
|
| 647 |
+
#loc147 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":223:33)
|
| 648 |
+
#loc148 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":223:20)
|
| 649 |
+
#loc149 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":227:48)
|
| 650 |
+
#loc150 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":227:29)
|
| 651 |
+
#loc151 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":229:4)
|
| 652 |
+
#loc167 = loc("q_start"(#loc3))
|
| 653 |
+
#loc168 = loc("off_zq"(#loc4))
|
| 654 |
+
#loc169 = loc("off_hq"(#loc5))
|
| 655 |
+
#loc170 = loc("off_hkv"(#loc6))
|
| 656 |
+
#loc171 = loc("q_offset"(#loc7))
|
| 657 |
+
#loc172 = loc("q_offset"(#loc8))
|
| 658 |
+
#loc173 = loc("q_offset"(#loc9))
|
| 659 |
+
#loc174 = loc("k_offset"(#loc10))
|
| 660 |
+
#loc175 = loc("Q"(#loc11))
|
| 661 |
+
#loc176 = loc("K"(#loc12))
|
| 662 |
+
#loc177 = loc("V"(#loc13))
|
| 663 |
+
#loc178 = loc("sparse_kv_idx_offset"(#loc14))
|
| 664 |
+
#loc179 = loc("offs_m"(#loc15))
|
| 665 |
+
#loc180 = loc("offs_m"(#loc16))
|
| 666 |
+
#loc181 = loc("offs_m"(#loc17))
|
| 667 |
+
#loc182 = loc("ptr"(#loc18))
|
| 668 |
+
#loc183 = loc("q"(#loc19))
|
| 669 |
+
#loc184 = loc("ptr"(#loc20))
|
| 670 |
+
#loc185 = loc("ptr"(#loc21))
|
| 671 |
+
#loc186 = loc("ptr"(#loc22))
|
| 672 |
+
#loc187 = loc("ptr"(#loc23))
|
| 673 |
+
#loc188 = loc("kv_indices"(#loc26))
|
| 674 |
+
#loc189 = loc("kv_start"(#loc27))
|
| 675 |
+
#loc190 = loc("kv_start"(#loc28))
|
| 676 |
+
#loc191 = loc("kv_num_blocks"(#loc29))
|
| 677 |
+
#loc192 = loc("kv_num_blocks"(#loc30))
|
| 678 |
+
#loc193 = loc("block_n_end"(#loc31))
|
| 679 |
+
#loc194 = loc("block_n_end"(#loc33))
|
| 680 |
+
#loc195 = loc("block_n_end"(#loc35))
|
| 681 |
+
#loc196 = loc("block_n_end"(#loc36))
|
| 682 |
+
#loc197 = loc("offs_n"(#loc37))
|
| 683 |
+
#loc198 = loc("offs_n"(#loc38))
|
| 684 |
+
#loc199 = loc("k"(#loc40))
|
| 685 |
+
#loc201 = loc("m"(#loc44))
|
| 686 |
+
#loc202 = loc("n"(#loc45))
|
| 687 |
+
#loc203 = loc("tmp3"(#loc46))
|
| 688 |
+
#loc204 = loc("tmp5"(#loc47))
|
| 689 |
+
#loc205 = loc("tmp6"(#loc48))
|
| 690 |
+
#loc206 = loc("tmp7"(#loc49))
|
| 691 |
+
#loc207 = loc("tmp9"(#loc50))
|
| 692 |
+
#loc208 = loc("tmp14"(#loc51))
|
| 693 |
+
#loc209 = loc("tmp14"(#loc52))
|
| 694 |
+
#loc210 = loc("tmp14"(#loc53))
|
| 695 |
+
#loc211 = loc("tmp14"(#loc54))
|
| 696 |
+
#loc212 = loc("tmp14"(#loc55))
|
| 697 |
+
#loc213 = loc("tmp14"(#loc56))
|
| 698 |
+
#loc214 = loc("tmp17"(#loc57))
|
| 699 |
+
#loc215 = loc("v"(#loc58))
|
| 700 |
+
#loc216 = loc("acc"(#loc59))
|
| 701 |
+
#loc217 = loc("offs_n_load"(#loc60))
|
| 702 |
+
#loc218 = loc("kv_base_offset"(#loc61))
|
| 703 |
+
#loc219 = loc("qk"(#loc62))
|
| 704 |
+
#loc220 = loc("k"(#loc63))
|
| 705 |
+
#loc221 = loc("qk"(#loc64))
|
| 706 |
+
#loc222 = loc("post_mod_scores"(#loc65))
|
| 707 |
+
#loc223 = loc("post_mod_scores"(#loc66))
|
| 708 |
+
#loc224 = loc("tmp8"(#loc67))
|
| 709 |
+
#loc225 = loc("tmp10"(#loc68))
|
| 710 |
+
#loc226 = loc("tmp11"(#loc69))
|
| 711 |
+
#loc227 = loc("tmp16"(#loc70))
|
| 712 |
+
#loc228 = loc("tmp16"(#loc71))
|
| 713 |
+
#loc229 = loc("tmp16"(#loc72))
|
| 714 |
+
#loc230 = loc("tmp16"(#loc73))
|
| 715 |
+
#loc231 = loc("tmp16"(#loc74))
|
| 716 |
+
#loc232 = loc("tmp16"(#loc75))
|
| 717 |
+
#loc233 = loc("tmp18"(#loc76))
|
| 718 |
+
#loc234 = loc("tmp19"(#loc77))
|
| 719 |
+
#loc235 = loc("tmp20"(#loc78))
|
| 720 |
+
#loc236 = loc("mask_mod_output"(#loc79))
|
| 721 |
+
#loc237 = loc("post_mod_scores"(#loc80))
|
| 722 |
+
#loc238 = loc("post_mod_scores"(#loc81))
|
| 723 |
+
#loc240 = loc("m_ij"(#loc85))
|
| 724 |
+
#loc241 = loc("masked_out_rows"(#loc86))
|
| 725 |
+
#loc242 = loc("m_ij_masked"(#loc87))
|
| 726 |
+
#loc243 = loc("alpha"(#loc88))
|
| 727 |
+
#loc244 = loc("alpha"(#loc89))
|
| 728 |
+
#loc245 = loc("p"(#loc90))
|
| 729 |
+
#loc246 = loc("p"(#loc91))
|
| 730 |
+
#loc247 = loc("p"(#loc92))
|
| 731 |
+
#loc248 = loc("l_i"(#loc93))
|
| 732 |
+
#loc250 = loc("l_i"(#loc97))
|
| 733 |
+
#loc251 = loc("acc"(#loc98))
|
| 734 |
+
#loc252 = loc("acc"(#loc99))
|
| 735 |
+
#loc253 = loc("acc"(#loc100))
|
| 736 |
+
#loc254 = loc("acc"(#loc101))
|
| 737 |
+
#loc255 = loc("offs_n"(#loc102))
|
| 738 |
+
#loc256 = loc("cur_block_idx"(#loc103))
|
| 739 |
+
#loc257 = loc("offset"(#loc104))
|
| 740 |
+
#loc258 = loc("cur_block"(#loc105))
|
| 741 |
+
#loc259 = loc("cur_block"(#loc106))
|
| 742 |
+
#loc260 = loc("next_block"(#loc107))
|
| 743 |
+
#loc261 = loc("next_block"(#loc108))
|
| 744 |
+
#loc262 = loc("next_block"(#loc109))
|
| 745 |
+
#loc263 = loc("next_block"(#loc110))
|
| 746 |
+
#loc264 = loc("needs_jump"(#loc111))
|
| 747 |
+
#loc265 = loc("needs_jump"(#loc112))
|
| 748 |
+
#loc266 = loc("needs_jump"(#loc113))
|
| 749 |
+
#loc267 = loc("jump_to_block"(#loc114))
|
| 750 |
+
#loc268 = loc("jump_to_block"(#loc115))
|
| 751 |
+
#loc269 = loc("jump_to_block"(#loc116))
|
| 752 |
+
#loc270 = loc("offset"(#loc117))
|
| 753 |
+
#loc271 = loc("offset"(#loc118))
|
| 754 |
+
#loc272 = loc("offset"(#loc119))
|
| 755 |
+
#loc273 = loc("offset"(#loc120))
|
| 756 |
+
#loc274 = loc("kv_offset"(#loc121))
|
| 757 |
+
#loc275 = loc("kv_indices"(#loc122))
|
| 758 |
+
#loc276 = loc("kv_start"(#loc123))
|
| 759 |
+
#loc277 = loc("kv_start"(#loc124))
|
| 760 |
+
#loc278 = loc("kv_num_blocks"(#loc125))
|
| 761 |
+
#loc279 = loc("kv_num_blocks"(#loc126))
|
| 762 |
+
#loc280 = loc("block_n_end"(#loc127))
|
| 763 |
+
#loc281 = loc("block_n_end"(#loc128))
|
| 764 |
+
#loc282 = loc("offs_n"(#loc129))
|
| 765 |
+
#loc284 = loc("l_i"(#loc132))
|
| 766 |
+
#loc285 = loc("l_i"(#loc133))
|
| 767 |
+
#loc286 = loc("acc"(#loc134))
|
| 768 |
+
#loc287 = loc("acc"(#loc135))
|
| 769 |
+
#loc288 = loc("mask"(#loc136))
|
| 770 |
+
#loc289 = loc("mask"(#loc137))
|
| 771 |
+
#loc290 = loc("off_hz"(#loc142))
|
| 772 |
+
#loc291 = loc("off_hz"(#loc143))
|
| 773 |
+
#loc292 = loc("l_ptrs"(#loc144))
|
| 774 |
+
#loc293 = loc("l_ptrs"(#loc145))
|
| 775 |
+
#loc294 = loc("l_ptrs"(#loc146))
|
| 776 |
+
#loc295 = loc("lse"(#loc147))
|
| 777 |
+
#loc296 = loc("lse"(#loc148))
|
| 778 |
+
#loc297 = loc(callsite(#loc182 at #loc183))
|
| 779 |
+
#loc298 = loc(callsite(#loc184 at #loc183))
|
| 780 |
+
#loc299 = loc(callsite(#loc185 at #loc183))
|
| 781 |
+
#loc300 = loc(callsite(#loc186 at #loc183))
|
| 782 |
+
#loc301 = loc(callsite(#loc187 at #loc183))
|
| 783 |
+
#loc302 = loc(callsite(#loc24 at #loc183))
|
| 784 |
+
#loc303 = loc(callsite(#loc25 at #loc183))
|
| 785 |
+
#loc304 = loc(callsite(#loc32 at #loc194))
|
| 786 |
+
#loc305 = loc(callsite(#loc34 at #loc194))
|
| 787 |
+
#loc306 = loc(callsite(#loc199 at #loc200))
|
| 788 |
+
#loc307 = loc(callsite(#loc201 at #loc200))
|
| 789 |
+
#loc308 = loc(callsite(#loc202 at #loc200))
|
| 790 |
+
#loc309 = loc(callsite(#loc203 at #loc200))
|
| 791 |
+
#loc310 = loc(callsite(#loc204 at #loc200))
|
| 792 |
+
#loc311 = loc(callsite(#loc205 at #loc200))
|
| 793 |
+
#loc312 = loc(callsite(#loc206 at #loc200))
|
| 794 |
+
#loc313 = loc(callsite(#loc207 at #loc200))
|
| 795 |
+
#loc314 = loc(callsite(#loc208 at #loc200))
|
| 796 |
+
#loc315 = loc(callsite(#loc209 at #loc200))
|
| 797 |
+
#loc316 = loc(callsite(#loc210 at #loc200))
|
| 798 |
+
#loc317 = loc(callsite(#loc211 at #loc200))
|
| 799 |
+
#loc318 = loc(callsite(#loc212 at #loc200))
|
| 800 |
+
#loc319 = loc(callsite(#loc213 at #loc200))
|
| 801 |
+
#loc320 = loc(callsite(#loc214 at #loc200))
|
| 802 |
+
#loc321 = loc(callsite(#loc215 at #loc200))
|
| 803 |
+
#loc322 = loc("l_i"(#loc216))
|
| 804 |
+
#loc323 = loc(callsite(#loc217 at #loc200))
|
| 805 |
+
#loc324 = loc(callsite(#loc218 at #loc200))
|
| 806 |
+
#loc325 = loc(callsite(#loc219 at #loc200))
|
| 807 |
+
#loc326 = loc(callsite(#loc220 at #loc200))
|
| 808 |
+
#loc327 = loc(callsite(#loc221 at #loc200))
|
| 809 |
+
#loc328 = loc(callsite(#loc222 at #loc200))
|
| 810 |
+
#loc329 = loc(callsite(#loc223 at #loc200))
|
| 811 |
+
#loc330 = loc(callsite(#loc224 at #loc200))
|
| 812 |
+
#loc331 = loc(callsite(#loc225 at #loc200))
|
| 813 |
+
#loc332 = loc(callsite(#loc226 at #loc200))
|
| 814 |
+
#loc333 = loc(callsite(#loc227 at #loc200))
|
| 815 |
+
#loc334 = loc(callsite(#loc228 at #loc200))
|
| 816 |
+
#loc335 = loc(callsite(#loc229 at #loc200))
|
| 817 |
+
#loc336 = loc(callsite(#loc230 at #loc200))
|
| 818 |
+
#loc337 = loc(callsite(#loc231 at #loc200))
|
| 819 |
+
#loc338 = loc(callsite(#loc232 at #loc200))
|
| 820 |
+
#loc339 = loc(callsite(#loc233 at #loc200))
|
| 821 |
+
#loc340 = loc(callsite(#loc234 at #loc200))
|
| 822 |
+
#loc341 = loc(callsite(#loc235 at #loc200))
|
| 823 |
+
#loc342 = loc(callsite(#loc236 at #loc200))
|
| 824 |
+
#loc343 = loc(callsite(#loc237 at #loc200))
|
| 825 |
+
#loc344 = loc(callsite(#loc238 at #loc200))
|
| 826 |
+
#loc346 = loc(callsite(#loc240 at #loc200))
|
| 827 |
+
#loc347 = loc(callsite(#loc241 at #loc200))
|
| 828 |
+
#loc348 = loc(callsite(#loc242 at #loc200))
|
| 829 |
+
#loc349 = loc(callsite(#loc243 at #loc200))
|
| 830 |
+
#loc350 = loc(callsite(#loc244 at #loc200))
|
| 831 |
+
#loc351 = loc(callsite(#loc245 at #loc200))
|
| 832 |
+
#loc352 = loc(callsite(#loc246 at #loc200))
|
| 833 |
+
#loc353 = loc(callsite(#loc247 at #loc200))
|
| 834 |
+
#loc354 = loc(callsite(#loc248 at #loc200))
|
| 835 |
+
#loc356 = loc(callsite(#loc250 at #loc200))
|
| 836 |
+
#loc357 = loc(callsite(#loc251 at #loc200))
|
| 837 |
+
#loc358 = loc(callsite(#loc252 at #loc200))
|
| 838 |
+
#loc359 = loc(callsite(#loc253 at #loc200))
|
| 839 |
+
#loc360 = loc(callsite(#loc254 at #loc200))
|
| 840 |
+
#loc361 = loc(callsite(#loc255 at #loc42))
|
| 841 |
+
#loc362 = loc(callsite(#loc257 at #loc42))
|
| 842 |
+
#loc363 = loc(callsite(#loc274 at #loc42))
|
| 843 |
+
#loc364 = loc(callsite(#loc199 at #loc283))
|
| 844 |
+
#loc365 = loc(callsite(#loc215 at #loc283))
|
| 845 |
+
#loc366 = loc(callsite(#loc217 at #loc283))
|
| 846 |
+
#loc367 = loc(callsite(#loc218 at #loc283))
|
| 847 |
+
#loc368 = loc(callsite(#loc219 at #loc283))
|
| 848 |
+
#loc369 = loc(callsite(#loc220 at #loc283))
|
| 849 |
+
#loc370 = loc(callsite(#loc221 at #loc283))
|
| 850 |
+
#loc371 = loc(callsite(#loc222 at #loc283))
|
| 851 |
+
#loc372 = loc(callsite(#loc223 at #loc283))
|
| 852 |
+
#loc373 = loc(callsite(#loc238 at #loc283))
|
| 853 |
+
#loc375 = loc(callsite(#loc240 at #loc283))
|
| 854 |
+
#loc376 = loc(callsite(#loc241 at #loc283))
|
| 855 |
+
#loc377 = loc(callsite(#loc242 at #loc283))
|
| 856 |
+
#loc378 = loc(callsite(#loc243 at #loc283))
|
| 857 |
+
#loc379 = loc(callsite(#loc244 at #loc283))
|
| 858 |
+
#loc380 = loc(callsite(#loc245 at #loc283))
|
| 859 |
+
#loc381 = loc(callsite(#loc246 at #loc283))
|
| 860 |
+
#loc382 = loc(callsite(#loc247 at #loc283))
|
| 861 |
+
#loc383 = loc(callsite(#loc248 at #loc283))
|
| 862 |
+
#loc385 = loc(callsite(#loc250 at #loc283))
|
| 863 |
+
#loc386 = loc(callsite(#loc251 at #loc283))
|
| 864 |
+
#loc387 = loc(callsite(#loc252 at #loc283))
|
| 865 |
+
#loc388 = loc(callsite(#loc253 at #loc283))
|
| 866 |
+
#loc389 = loc(callsite(#loc254 at #loc283))
|
| 867 |
+
#loc390 = loc(callsite(#loc255 at #loc131))
|
| 868 |
+
#loc391 = loc(callsite(#loc257 at #loc131))
|
| 869 |
+
#loc392 = loc(callsite(#loc274 at #loc131))
|
| 870 |
+
#loc393 = loc(callsite(#loc185 at #loc306))
|
| 871 |
+
#loc394 = loc(callsite(#loc187 at #loc306))
|
| 872 |
+
#loc395 = loc(callsite(#loc24 at #loc306))
|
| 873 |
+
#loc396 = loc(callsite(#loc43 at #loc307))
|
| 874 |
+
#loc397 = loc(callsite(#loc43 at #loc308))
|
| 875 |
+
#loc398 = loc(callsite(#loc185 at #loc321))
|
| 876 |
+
#loc399 = loc(callsite(#loc25 at #loc306))
|
| 877 |
+
#loc400 = loc(callsite(#loc25 at #loc321))
|
| 878 |
+
#loc401 = loc("m_i"(#loc322))
|
| 879 |
+
#loc402 = loc(callsite(#loc182 at #loc306))
|
| 880 |
+
#loc403 = loc(callsite(#loc184 at #loc306))
|
| 881 |
+
#loc404 = loc(callsite(#loc187 at #loc321))
|
| 882 |
+
#loc405 = loc(callsite(#loc82 at #loc345))
|
| 883 |
+
#loc407 = loc(callsite(#loc94 at #loc355))
|
| 884 |
+
#loc409 = loc(callsite(#loc256 at #loc362))
|
| 885 |
+
#loc410 = loc(callsite(#loc258 at #loc362))
|
| 886 |
+
#loc411 = loc(callsite(#loc259 at #loc362))
|
| 887 |
+
#loc412 = loc(callsite(#loc260 at #loc362))
|
| 888 |
+
#loc413 = loc(callsite(#loc261 at #loc362))
|
| 889 |
+
#loc414 = loc(callsite(#loc262 at #loc362))
|
| 890 |
+
#loc415 = loc(callsite(#loc263 at #loc362))
|
| 891 |
+
#loc416 = loc(callsite(#loc264 at #loc362))
|
| 892 |
+
#loc417 = loc(callsite(#loc265 at #loc362))
|
| 893 |
+
#loc418 = loc(callsite(#loc266 at #loc362))
|
| 894 |
+
#loc419 = loc(callsite(#loc267 at #loc362))
|
| 895 |
+
#loc420 = loc(callsite(#loc268 at #loc362))
|
| 896 |
+
#loc421 = loc(callsite(#loc269 at #loc362))
|
| 897 |
+
#loc422 = loc(callsite(#loc270 at #loc362))
|
| 898 |
+
#loc423 = loc(callsite(#loc271 at #loc362))
|
| 899 |
+
#loc424 = loc(callsite(#loc272 at #loc362))
|
| 900 |
+
#loc425 = loc(callsite(#loc273 at #loc362))
|
| 901 |
+
#loc426 = loc(callsite(#loc25 at #loc364))
|
| 902 |
+
#loc427 = loc(callsite(#loc25 at #loc365))
|
| 903 |
+
#loc428 = loc(callsite(#loc182 at #loc364))
|
| 904 |
+
#loc429 = loc(callsite(#loc184 at #loc364))
|
| 905 |
+
#loc430 = loc(callsite(#loc185 at #loc364))
|
| 906 |
+
#loc431 = loc(callsite(#loc187 at #loc364))
|
| 907 |
+
#loc432 = loc(callsite(#loc24 at #loc364))
|
| 908 |
+
#loc433 = loc(callsite(#loc185 at #loc365))
|
| 909 |
+
#loc434 = loc(callsite(#loc187 at #loc365))
|
| 910 |
+
#loc435 = loc(callsite(#loc82 at #loc374))
|
| 911 |
+
#loc437 = loc(callsite(#loc94 at #loc384))
|
| 912 |
+
#loc439 = loc(callsite(#loc256 at #loc391))
|
| 913 |
+
#loc440 = loc(callsite(#loc258 at #loc391))
|
| 914 |
+
#loc441 = loc(callsite(#loc259 at #loc391))
|
| 915 |
+
#loc442 = loc(callsite(#loc260 at #loc391))
|
| 916 |
+
#loc443 = loc(callsite(#loc261 at #loc391))
|
| 917 |
+
#loc444 = loc(callsite(#loc262 at #loc391))
|
| 918 |
+
#loc445 = loc(callsite(#loc263 at #loc391))
|
| 919 |
+
#loc446 = loc(callsite(#loc264 at #loc391))
|
| 920 |
+
#loc447 = loc(callsite(#loc265 at #loc391))
|
| 921 |
+
#loc448 = loc(callsite(#loc266 at #loc391))
|
| 922 |
+
#loc449 = loc(callsite(#loc267 at #loc391))
|
| 923 |
+
#loc450 = loc(callsite(#loc268 at #loc391))
|
| 924 |
+
#loc451 = loc(callsite(#loc269 at #loc391))
|
| 925 |
+
#loc452 = loc(callsite(#loc270 at #loc391))
|
| 926 |
+
#loc453 = loc(callsite(#loc271 at #loc391))
|
| 927 |
+
#loc454 = loc(callsite(#loc272 at #loc391))
|
| 928 |
+
#loc455 = loc(callsite(#loc273 at #loc391))
|
| 929 |
+
#loc456 = loc("offs_n"(#loc401))
|
| 930 |
+
#loc457 = loc(callsite(#loc84 at #loc405))
|
| 931 |
+
#loc458 = loc(callsite(#loc96 at #loc407))
|
| 932 |
+
#loc459 = loc(callsite(#loc84 at #loc435))
|
| 933 |
+
#loc460 = loc(callsite(#loc96 at #loc437))
|
| 934 |
+
#loc461 = loc("kv_offset"(#loc456))
|
| 935 |
+
#loc462 = loc(callsite(#loc461 at #loc42))
|
| 936 |
+
#loc463 = loc(callsite(#loc461 at #loc131))
|
progress/SpecForge/cache/compiled_kernels/triton/2/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ttir
ADDED
|
@@ -0,0 +1,780 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":18:0)
|
| 2 |
+
#loc1 = loc(unknown)
|
| 3 |
+
#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":172:41)
|
| 4 |
+
#loc48 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":520:16)
|
| 5 |
+
#loc87 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":403:51)
|
| 6 |
+
#loc99 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":416:34)
|
| 7 |
+
#loc137 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":198:45)
|
| 8 |
+
#loc156 = loc("arg_Q"(#loc))
|
| 9 |
+
#loc157 = loc("arg_K"(#loc))
|
| 10 |
+
#loc158 = loc("arg_V"(#loc))
|
| 11 |
+
#loc159 = loc("arg_LSE"(#loc))
|
| 12 |
+
#loc160 = loc("arg_MAX"(#loc))
|
| 13 |
+
#loc161 = loc("arg_KV_NUM_BLKS"(#loc))
|
| 14 |
+
#loc162 = loc("arg_KV_IDX"(#loc))
|
| 15 |
+
#loc163 = loc("arg_FULL_KV_NUM_BLKS"(#loc))
|
| 16 |
+
#loc164 = loc("arg_FULL_KV_IDX"(#loc))
|
| 17 |
+
#loc165 = loc("out_ptr0"(#loc))
|
| 18 |
+
#loc166 = loc("ks0"(#loc))
|
| 19 |
+
#loc167 = loc("ks1"(#loc))
|
| 20 |
+
#loc168 = loc("ks2"(#loc))
|
| 21 |
+
#loc169 = loc("ks3"(#loc))
|
| 22 |
+
#loc170 = loc("ks4"(#loc))
|
| 23 |
+
#loc210 = loc(callsite(#loc48 at #loc2))
|
| 24 |
+
#loc247 = loc("m_ij"(#loc87))
|
| 25 |
+
#loc257 = loc("l_i"(#loc99))
|
| 26 |
+
#loc293 = loc(callsite(#loc48 at #loc137))
|
| 27 |
+
#loc354 = loc(callsite(#loc247 at #loc210))
|
| 28 |
+
#loc364 = loc(callsite(#loc257 at #loc210))
|
| 29 |
+
#loc383 = loc(callsite(#loc247 at #loc293))
|
| 30 |
+
#loc393 = loc(callsite(#loc257 at #loc293))
|
| 31 |
+
#loc413 = loc(callsite(#loc1 at #loc354))
|
| 32 |
+
#loc415 = loc(callsite(#loc1 at #loc364))
|
| 33 |
+
#loc443 = loc(callsite(#loc1 at #loc383))
|
| 34 |
+
#loc445 = loc(callsite(#loc1 at #loc393))
|
| 35 |
+
module {
|
| 36 |
+
tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_MAX: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("arg_MAX"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc)), %ks2: i32 loc("ks2"(#loc)), %ks3: i32 loc("ks3"(#loc)), %ks4: i32 loc("ks4"(#loc))) attributes {noinline = false} {
|
| 37 |
+
%cst = arith.constant dense<1024> : tensor<64x1xi32> loc(#loc1)
|
| 38 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc1)
|
| 39 |
+
%cst_0 = arith.constant dense<0.000000e+00> : tensor<64x128xbf16> loc(#loc1)
|
| 40 |
+
%cst_1 = arith.constant dense<16> : tensor<1x64xi32> loc(#loc171)
|
| 41 |
+
%cst_2 = arith.constant dense<16> : tensor<128x1xi32> loc(#loc171)
|
| 42 |
+
%cst_3 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc1)
|
| 43 |
+
%cst_4 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc1)
|
| 44 |
+
%cst_5 = arith.constant dense<false> : tensor<128x64xi1> loc(#loc171)
|
| 45 |
+
%cst_6 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc171)
|
| 46 |
+
%cst_7 = arith.constant dense<1> : tensor<128x1xi32> loc(#loc171)
|
| 47 |
+
%cst_8 = arith.constant dense<0> : tensor<128x1xi32> loc(#loc171)
|
| 48 |
+
%cst_9 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc171)
|
| 49 |
+
%cst_10 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc1)
|
| 50 |
+
%cst_11 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc1)
|
| 51 |
+
%cst_12 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1)
|
| 52 |
+
%c63_i32 = arith.constant 63 : i32 loc(#loc1)
|
| 53 |
+
%c64_i32 = arith.constant 64 : i32 loc(#loc1)
|
| 54 |
+
%q = arith.constant dense<0.000000e+00> : tensor<128x128xbf16> loc(#loc306)
|
| 55 |
+
%acc = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc307)
|
| 56 |
+
%cst_13 = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc1)
|
| 57 |
+
%mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc174)
|
| 58 |
+
%l_i = arith.constant dense<1.000000e+00> : tensor<128xf32> loc(#loc175)
|
| 59 |
+
%cst_14 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc1)
|
| 60 |
+
%c2_i32 = arith.constant 2 : i32 loc(#loc1)
|
| 61 |
+
%c4_i32 = arith.constant 4 : i32 loc(#loc1)
|
| 62 |
+
%HQ = arith.constant 32 : i32 loc(#loc176)
|
| 63 |
+
%c1_i32 = arith.constant 1 : i32 loc(#loc1)
|
| 64 |
+
%c128_i32 = arith.constant 128 : i32 loc(#loc1)
|
| 65 |
+
%c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
|
| 66 |
+
%0 = arith.muli %ks0, %c4096_i32 : i32 loc(#loc10)
|
| 67 |
+
%q_start = tt.get_program_id x : i32 loc(#loc177)
|
| 68 |
+
%off_zq = tt.get_program_id y : i32 loc(#loc178)
|
| 69 |
+
%off_hq = tt.get_program_id z : i32 loc(#loc179)
|
| 70 |
+
%off_hkv = arith.divsi %off_hq, %c4_i32 : i32 loc(#loc180)
|
| 71 |
+
%q_offset = arith.muli %off_zq, %0 : i32 loc(#loc181)
|
| 72 |
+
%q_offset_15 = arith.muli %off_hq, %c128_i32 : i32 loc(#loc182)
|
| 73 |
+
%q_offset_16 = arith.addi %q_offset, %q_offset_15 : i32 loc(#loc183)
|
| 74 |
+
%k_offset = arith.muli %off_hkv, %c128_i32 : i32 loc(#loc184)
|
| 75 |
+
%Q = tt.addptr %arg_Q, %q_offset_16 : !tt.ptr<bf16>, i32 loc(#loc185)
|
| 76 |
+
%K = tt.addptr %arg_K, %k_offset : !tt.ptr<bf16>, i32 loc(#loc186)
|
| 77 |
+
%V = tt.addptr %arg_V, %k_offset : !tt.ptr<bf16>, i32 loc(#loc187)
|
| 78 |
+
%sparse_kv_idx_offset = arith.muli %q_start, %ks4 : i32 loc(#loc188)
|
| 79 |
+
%offs_m = arith.muli %q_start, %c128_i32 : i32 loc(#loc189)
|
| 80 |
+
%offs_m_17 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc190)
|
| 81 |
+
%offs_m_18 = tt.splat %offs_m : i32 -> tensor<128xi32> loc(#loc191)
|
| 82 |
+
%offs_m_19 = arith.addi %offs_m_18, %offs_m_17 : tensor<128xi32> loc(#loc191)
|
| 83 |
+
%ptr = tt.expand_dims %offs_m_19 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc308)
|
| 84 |
+
%ptr_20 = arith.muli %ptr, %cst_13 : tensor<128x1xi32> loc(#loc309)
|
| 85 |
+
%ptr_21 = tt.splat %Q : !tt.ptr<bf16> -> tensor<128x1x!tt.ptr<bf16>> loc(#loc310)
|
| 86 |
+
%ptr_22 = tt.addptr %ptr_21, %ptr_20 : tensor<128x1x!tt.ptr<bf16>>, tensor<128x1xi32> loc(#loc310)
|
| 87 |
+
%ptr_23 = tt.expand_dims %offs_m_17 {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc311)
|
| 88 |
+
%ptr_24 = tt.broadcast %ptr_22 : tensor<128x1x!tt.ptr<bf16>> -> tensor<128x128x!tt.ptr<bf16>> loc(#loc312)
|
| 89 |
+
%ptr_25 = tt.broadcast %ptr_23 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc312)
|
| 90 |
+
%ptr_26 = tt.addptr %ptr_24, %ptr_25 : tensor<128x128x!tt.ptr<bf16>>, tensor<128x128xi32> loc(#loc312)
|
| 91 |
+
%q_27 = tt.splat %ks0 : i32 -> tensor<128x1xi32> loc(#loc313)
|
| 92 |
+
%q_28 = arith.cmpi slt, %ptr, %q_27 : tensor<128x1xi32> loc(#loc313)
|
| 93 |
+
%q_29 = tt.broadcast %q_28 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc306)
|
| 94 |
+
%q_30 = tt.load %ptr_26, %q_29, %q : tensor<128x128x!tt.ptr<bf16>> loc(#loc306)
|
| 95 |
+
%kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset : !tt.ptr<i32>, i32 loc(#loc197)
|
| 96 |
+
%kv_start = tt.load %kv_indices : !tt.ptr<i32> loc(#loc198)
|
| 97 |
+
%kv_start_31 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc199)
|
| 98 |
+
%kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %q_start : !tt.ptr<i32>, i32 loc(#loc200)
|
| 99 |
+
%kv_num_blocks_32 = tt.load %kv_num_blocks : !tt.ptr<i32> loc(#loc201)
|
| 100 |
+
%block_n_end = arith.muli %kv_num_blocks_32, %c2_i32 : i32 loc(#loc202)
|
| 101 |
+
%block_n_end_33 = arith.addi %ks1, %c63_i32 : i32 loc(#loc314)
|
| 102 |
+
%block_n_end_34 = arith.divsi %block_n_end_33, %c64_i32 : i32 loc(#loc315)
|
| 103 |
+
%block_n_end_35 = arith.maxsi %block_n_end_34, %c1_i32 : i32 loc(#loc204)
|
| 104 |
+
%block_n_end_36 = arith.minsi %block_n_end, %block_n_end_35 : i32 loc(#loc205)
|
| 105 |
+
%offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc206)
|
| 106 |
+
%offs_n_37 = tt.splat %kv_start_31 : i32 -> tensor<64xi32> loc(#loc207)
|
| 107 |
+
%offs_n_38 = arith.addi %offs_n_37, %offs_n : tensor<64xi32> loc(#loc207)
|
| 108 |
+
%1 = tt.expand_dims %offs_n_38 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc45)
|
| 109 |
+
%kv_offset:5 = scf.for %start_n = %c0_i32 to %block_n_end_36 step %c1_i32 iter_args(%acc_62 = %acc, %l_i_63 = %cst_14, %m_i = %cst_3, %offs_n_64 = %1, %kv_offset_65 = %c0_i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 {
|
| 110 |
+
%kv_base_offset = arith.addi %kv_start_31, %kv_offset_65 : i32 loc(#loc317)
|
| 111 |
+
%offs_n_load = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc318)
|
| 112 |
+
%offs_n_load_66 = arith.addi %offs_n_load, %offs_n : tensor<64xi32> loc(#loc318)
|
| 113 |
+
%ptr_67 = tt.expand_dims %offs_n_load_66 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc404)
|
| 114 |
+
%ptr_68 = arith.muli %ptr_67, %cst : tensor<64x1xi32> loc(#loc405)
|
| 115 |
+
%ptr_69 = tt.splat %K : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc406)
|
| 116 |
+
%ptr_70 = tt.addptr %ptr_69, %ptr_68 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc406)
|
| 117 |
+
%ptr_71 = tt.broadcast %ptr_70 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc407)
|
| 118 |
+
%ptr_72 = tt.broadcast %ptr_23 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc407)
|
| 119 |
+
%ptr_73 = tt.addptr %ptr_71, %ptr_72 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc407)
|
| 120 |
+
%k = tt.splat %ks1 : i32 -> tensor<64x1xi32> loc(#loc408)
|
| 121 |
+
%k_74 = arith.cmpi slt, %ptr_67, %k : tensor<64x1xi32> loc(#loc408)
|
| 122 |
+
%k_75 = tt.broadcast %k_74 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc409)
|
| 123 |
+
%k_76 = tt.load %ptr_73, %k_75, %cst_0 : tensor<64x128x!tt.ptr<bf16>> loc(#loc409)
|
| 124 |
+
%k_77 = tt.trans %k_76 {order = array<i32: 1, 0>} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc320)
|
| 125 |
+
%qk = tt.dot %q_30, %k_77, %cst_12, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc321)
|
| 126 |
+
%qk_78 = arith.mulf %qk, %cst_11 : tensor<128x64xf32> loc(#loc322)
|
| 127 |
+
%m = arith.remsi %ptr, %q_27 : tensor<128x1xi32> loc(#loc410)
|
| 128 |
+
%n = tt.splat %ks1 : i32 -> tensor<1x64xi32> loc(#loc411)
|
| 129 |
+
%n_79 = arith.remsi %offs_n_64, %n : tensor<1x64xi32> loc(#loc411)
|
| 130 |
+
%post_mod_scores = arith.cmpi slt, %offs_n_64, %n : tensor<1x64xi32> loc(#loc325)
|
| 131 |
+
%post_mod_scores_80 = tt.broadcast %post_mod_scores : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc326)
|
| 132 |
+
%post_mod_scores_81 = arith.select %post_mod_scores_80, %qk_78, %cst_10 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc326)
|
| 133 |
+
%tmp3 = arith.cmpi slt, %m, %cst_8 : tensor<128x1xi32> loc(#loc327)
|
| 134 |
+
%tmp5 = tt.broadcast %n_79 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc328)
|
| 135 |
+
%tmp5_82 = tt.broadcast %m : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc328)
|
| 136 |
+
%tmp5_83 = arith.cmpi sle, %tmp5, %tmp5_82 : tensor<128x64xi32> loc(#loc328)
|
| 137 |
+
%tmp6 = tt.broadcast %tmp3 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc329)
|
| 138 |
+
%tmp6_84 = arith.andi %tmp6, %tmp5_83 : tensor<128x64xi1> loc(#loc329)
|
| 139 |
+
%tmp7 = arith.cmpi sge, %m, %cst_8 : tensor<128x1xi32> loc(#loc330)
|
| 140 |
+
%tmp8 = arith.cmpi slt, %n_79, %cst_9 : tensor<1x64xi32> loc(#loc331)
|
| 141 |
+
%tmp9 = tt.broadcast %tmp7 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc332)
|
| 142 |
+
%tmp9_85 = tt.broadcast %tmp8 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc332)
|
| 143 |
+
%tmp9_86 = arith.andi %tmp9, %tmp9_85 : tensor<128x64xi1> loc(#loc332)
|
| 144 |
+
%tmp10 = arith.extui %tmp8 : tensor<1x64xi1> to tensor<1x64xi32> loc(#loc333)
|
| 145 |
+
%tmp10_87 = arith.cmpi eq, %tmp10, %cst_9 : tensor<1x64xi32> loc(#loc333)
|
| 146 |
+
%tmp11 = tt.broadcast %tmp10_87 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc334)
|
| 147 |
+
%tmp11_88 = arith.andi %tmp9, %tmp11 : tensor<128x64xi1> loc(#loc334)
|
| 148 |
+
%tmp14 = arith.remsi %m, %cst_2 : tensor<128x1xi32> loc(#loc335)
|
| 149 |
+
%tmp14_89 = arith.cmpi ne, %tmp14, %cst_8 : tensor<128x1xi32> loc(#loc336)
|
| 150 |
+
%tmp14_90 = arith.divsi %m, %cst_2 : tensor<128x1xi32> loc(#loc337)
|
| 151 |
+
%tmp14_91 = arith.subi %tmp14_90, %cst_7 : tensor<128x1xi32> loc(#loc338)
|
| 152 |
+
%tmp14_92 = arith.select %tmp14_89, %tmp14_91, %tmp14_90 : tensor<128x1xi1>, tensor<128x1xi32> loc(#loc339)
|
| 153 |
+
%tmp14_93 = arith.select %tmp3, %tmp14_92, %tmp14_90 : tensor<128x1xi1>, tensor<128x1xi32> loc(#loc340)
|
| 154 |
+
%tmp16 = arith.remsi %n_79, %cst_1 : tensor<1x64xi32> loc(#loc341)
|
| 155 |
+
%tmp16_94 = arith.cmpi ne, %tmp16, %cst_9 : tensor<1x64xi32> loc(#loc342)
|
| 156 |
+
%tmp16_95 = arith.divsi %n_79, %cst_1 : tensor<1x64xi32> loc(#loc343)
|
| 157 |
+
%tmp16_96 = arith.subi %tmp16_95, %cst_6 : tensor<1x64xi32> loc(#loc344)
|
| 158 |
+
%tmp16_97 = arith.select %tmp16_94, %tmp16_96, %tmp16_95 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc345)
|
| 159 |
+
%tmp16_98 = arith.select %tmp8, %tmp16_97, %tmp16_95 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc346)
|
| 160 |
+
%tmp17 = tt.broadcast %tmp14_93 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc347)
|
| 161 |
+
%tmp17_99 = tt.broadcast %tmp16_98 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc347)
|
| 162 |
+
%tmp17_100 = arith.cmpi eq, %tmp17, %tmp17_99 : tensor<128x64xi32> loc(#loc347)
|
| 163 |
+
%tmp18 = arith.andi %tmp11_88, %tmp17_100 : tensor<128x64xi1> loc(#loc348)
|
| 164 |
+
%tmp19 = arith.ori %tmp9_86, %tmp18 : tensor<128x64xi1> loc(#loc349)
|
| 165 |
+
%tmp20 = arith.ori %tmp6_84, %tmp19 : tensor<128x64xi1> loc(#loc350)
|
| 166 |
+
%mask_mod_output = arith.select %post_mod_scores_80, %tmp20, %cst_5 : tensor<128x64xi1>, tensor<128x64xi1> loc(#loc351)
|
| 167 |
+
%post_mod_scores_101 = arith.select %mask_mod_output, %post_mod_scores_81, %cst_10 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc352)
|
| 168 |
+
%post_mod_scores_102 = arith.mulf %post_mod_scores_101, %cst_4 : tensor<128x64xf32> loc(#loc353)
|
| 169 |
+
%m_ij = "tt.reduce"(%post_mod_scores_102) <{axis = 1 : i32}> ({
|
| 170 |
+
^bb0(%m_ij_135: f32 loc(callsite(#loc1 at #loc354)), %m_ij_136: f32 loc(callsite(#loc1 at #loc354))):
|
| 171 |
+
%m_ij_137 = arith.maxnumf %m_ij_135, %m_ij_136 : f32 loc(#loc467)
|
| 172 |
+
tt.reduce.return %m_ij_137 : f32 loc(#loc412)
|
| 173 |
+
}) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc412)
|
| 174 |
+
%m_ij_103 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc355)
|
| 175 |
+
%masked_out_rows = arith.cmpf oeq, %m_ij_103, %cst_3 : tensor<128xf32> loc(#loc356)
|
| 176 |
+
%m_ij_masked = arith.select %masked_out_rows, %cst_14, %m_ij_103 : tensor<128xi1>, tensor<128xf32> loc(#loc357)
|
| 177 |
+
%alpha = arith.subf %m_i, %m_ij_masked : tensor<128xf32> loc(#loc358)
|
| 178 |
+
%alpha_104 = math.exp2 %alpha : tensor<128xf32> loc(#loc359)
|
| 179 |
+
%p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc360)
|
| 180 |
+
%p_105 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc361)
|
| 181 |
+
%p_106 = arith.subf %post_mod_scores_102, %p_105 : tensor<128x64xf32> loc(#loc361)
|
| 182 |
+
%p_107 = math.exp2 %p_106 : tensor<128x64xf32> loc(#loc362)
|
| 183 |
+
%l_i_108 = arith.mulf %l_i_63, %alpha_104 : tensor<128xf32> loc(#loc363)
|
| 184 |
+
%l_i_109 = "tt.reduce"(%p_107) <{axis = 1 : i32}> ({
|
| 185 |
+
^bb0(%l_i_135: f32 loc(callsite(#loc1 at #loc364)), %l_i_136: f32 loc(callsite(#loc1 at #loc364))):
|
| 186 |
+
%l_i_137 = arith.addf %l_i_135, %l_i_136 : f32 loc(#loc468)
|
| 187 |
+
tt.reduce.return %l_i_137 : f32 loc(#loc414)
|
| 188 |
+
}) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc414)
|
| 189 |
+
%l_i_110 = arith.addf %l_i_108, %l_i_109 : tensor<128xf32> loc(#loc365)
|
| 190 |
+
%acc_111 = tt.expand_dims %alpha_104 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc366)
|
| 191 |
+
%acc_112 = tt.broadcast %acc_111 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc367)
|
| 192 |
+
%acc_113 = arith.mulf %acc_62, %acc_112 : tensor<128x128xf32> loc(#loc367)
|
| 193 |
+
%ptr_114 = tt.splat %V : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc416)
|
| 194 |
+
%ptr_115 = tt.addptr %ptr_114, %ptr_68 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc416)
|
| 195 |
+
%ptr_116 = tt.broadcast %ptr_115 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc417)
|
| 196 |
+
%ptr_117 = tt.addptr %ptr_116, %ptr_72 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc417)
|
| 197 |
+
%v = tt.load %ptr_117, %k_75, %cst_0 : tensor<64x128x!tt.ptr<bf16>> loc(#loc418)
|
| 198 |
+
%acc_118 = arith.truncf %p_107 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc369)
|
| 199 |
+
%acc_119 = tt.dot %acc_118, %v, %acc_113, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc370)
|
| 200 |
+
%cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc419)
|
| 201 |
+
%cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr<i32>, i32 loc(#loc420)
|
| 202 |
+
%cur_block_120 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc421)
|
| 203 |
+
%next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc422)
|
| 204 |
+
%next_block_121 = arith.cmpi slt, %next_block, %kv_num_blocks_32 : i32 loc(#loc423)
|
| 205 |
+
%next_block_122 = tt.addptr %cur_block, %c1_i32 : !tt.ptr<i32>, i32 loc(#loc424)
|
| 206 |
+
%next_block_123 = tt.load %next_block_122, %next_block_121 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc425)
|
| 207 |
+
%needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc426)
|
| 208 |
+
%needs_jump_124 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc427)
|
| 209 |
+
%needs_jump_125 = arith.cmpi eq, %needs_jump_124, %c0_i32 : i32 loc(#loc428)
|
| 210 |
+
%jump_to_block = arith.subi %next_block_123, %cur_block_120 : i32 loc(#loc429)
|
| 211 |
+
%jump_to_block_126 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc430)
|
| 212 |
+
%jump_to_block_127 = arith.subi %jump_to_block_126, %c64_i32 : i32 loc(#loc431)
|
| 213 |
+
%offset = arith.extui %needs_jump_125 : i1 to i32 loc(#loc432)
|
| 214 |
+
%offset_128 = arith.muli %jump_to_block_127, %offset : i32 loc(#loc432)
|
| 215 |
+
%offset_129 = arith.subi %c1_i32, %offset : i32 loc(#loc433)
|
| 216 |
+
%offset_130 = arith.muli %offset_129, %c64_i32 : i32 loc(#loc434)
|
| 217 |
+
%offset_131 = arith.addi %offset_128, %offset_130 : i32 loc(#loc435)
|
| 218 |
+
%offs_n_132 = tt.splat %offset_131 : i32 -> tensor<1x64xi32> loc(#loc372)
|
| 219 |
+
%offs_n_133 = arith.addi %offs_n_64, %offs_n_132 : tensor<1x64xi32> loc(#loc372)
|
| 220 |
+
%kv_offset_134 = arith.addi %kv_offset_65, %offset_131 : i32 loc(#loc373)
|
| 221 |
+
scf.yield %acc_119, %l_i_110, %m_ij_103, %offs_n_133, %kv_offset_134 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc284)
|
| 222 |
+
} loc(#loc472)
|
| 223 |
+
%kv_indices_39 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset : !tt.ptr<i32>, i32 loc(#loc285)
|
| 224 |
+
%kv_start_40 = tt.load %kv_indices_39 : !tt.ptr<i32> loc(#loc286)
|
| 225 |
+
%kv_start_41 = arith.muli %kv_start_40, %c128_i32 : i32 loc(#loc287)
|
| 226 |
+
%kv_num_blocks_42 = tt.addptr %arg_FULL_KV_NUM_BLKS, %q_start : !tt.ptr<i32>, i32 loc(#loc288)
|
| 227 |
+
%kv_num_blocks_43 = tt.load %kv_num_blocks_42 : !tt.ptr<i32> loc(#loc289)
|
| 228 |
+
%block_n_end_44 = arith.muli %kv_num_blocks_43, %c2_i32 : i32 loc(#loc290)
|
| 229 |
+
%block_n_end_45 = arith.minsi %block_n_end_44, %block_n_end_35 : i32 loc(#loc291)
|
| 230 |
+
%offs_n_46 = tt.splat %kv_start_41 : i32 -> tensor<64xi32> loc(#loc292)
|
| 231 |
+
%offs_n_47 = arith.addi %offs_n_46, %offs_n : tensor<64xi32> loc(#loc292)
|
| 232 |
+
%2 = tt.expand_dims %offs_n_47 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc136)
|
| 233 |
+
%kv_offset_48:5 = scf.for %start_n = %c0_i32 to %block_n_end_45 step %c1_i32 iter_args(%acc_62 = %kv_offset#0, %l_i_63 = %kv_offset#1, %m_i = %kv_offset#2, %offs_n_64 = %2, %kv_offset_65 = %c0_i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 {
|
| 234 |
+
%kv_base_offset = arith.addi %kv_start_41, %kv_offset_65 : i32 loc(#loc374)
|
| 235 |
+
%offs_n_load = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc375)
|
| 236 |
+
%offs_n_load_66 = arith.addi %offs_n_load, %offs_n : tensor<64xi32> loc(#loc375)
|
| 237 |
+
%ptr_67 = tt.expand_dims %offs_n_load_66 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc436)
|
| 238 |
+
%ptr_68 = arith.muli %ptr_67, %cst : tensor<64x1xi32> loc(#loc437)
|
| 239 |
+
%ptr_69 = tt.splat %K : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc438)
|
| 240 |
+
%ptr_70 = tt.addptr %ptr_69, %ptr_68 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc438)
|
| 241 |
+
%ptr_71 = tt.broadcast %ptr_70 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc439)
|
| 242 |
+
%ptr_72 = tt.broadcast %ptr_23 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc439)
|
| 243 |
+
%ptr_73 = tt.addptr %ptr_71, %ptr_72 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc439)
|
| 244 |
+
%k = tt.splat %ks1 : i32 -> tensor<64x1xi32> loc(#loc440)
|
| 245 |
+
%k_74 = arith.cmpi slt, %ptr_67, %k : tensor<64x1xi32> loc(#loc440)
|
| 246 |
+
%k_75 = tt.broadcast %k_74 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc441)
|
| 247 |
+
%k_76 = tt.load %ptr_73, %k_75, %cst_0 : tensor<64x128x!tt.ptr<bf16>> loc(#loc441)
|
| 248 |
+
%k_77 = tt.trans %k_76 {order = array<i32: 1, 0>} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc377)
|
| 249 |
+
%qk = tt.dot %q_30, %k_77, %cst_12, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc378)
|
| 250 |
+
%qk_78 = arith.mulf %qk, %cst_11 : tensor<128x64xf32> loc(#loc379)
|
| 251 |
+
%post_mod_scores = tt.splat %ks1 : i32 -> tensor<1x64xi32> loc(#loc380)
|
| 252 |
+
%post_mod_scores_79 = arith.cmpi slt, %offs_n_64, %post_mod_scores : tensor<1x64xi32> loc(#loc380)
|
| 253 |
+
%post_mod_scores_80 = tt.broadcast %post_mod_scores_79 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc381)
|
| 254 |
+
%post_mod_scores_81 = arith.select %post_mod_scores_80, %qk_78, %cst_10 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc381)
|
| 255 |
+
%post_mod_scores_82 = arith.mulf %post_mod_scores_81, %cst_4 : tensor<128x64xf32> loc(#loc382)
|
| 256 |
+
%m_ij = "tt.reduce"(%post_mod_scores_82) <{axis = 1 : i32}> ({
|
| 257 |
+
^bb0(%m_ij_115: f32 loc(callsite(#loc1 at #loc383)), %m_ij_116: f32 loc(callsite(#loc1 at #loc383))):
|
| 258 |
+
%m_ij_117 = arith.maxnumf %m_ij_115, %m_ij_116 : f32 loc(#loc469)
|
| 259 |
+
tt.reduce.return %m_ij_117 : f32 loc(#loc442)
|
| 260 |
+
}) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc442)
|
| 261 |
+
%m_ij_83 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc384)
|
| 262 |
+
%masked_out_rows = arith.cmpf oeq, %m_ij_83, %cst_3 : tensor<128xf32> loc(#loc385)
|
| 263 |
+
%m_ij_masked = arith.select %masked_out_rows, %cst_14, %m_ij_83 : tensor<128xi1>, tensor<128xf32> loc(#loc386)
|
| 264 |
+
%alpha = arith.subf %m_i, %m_ij_masked : tensor<128xf32> loc(#loc387)
|
| 265 |
+
%alpha_84 = math.exp2 %alpha : tensor<128xf32> loc(#loc388)
|
| 266 |
+
%p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc389)
|
| 267 |
+
%p_85 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc390)
|
| 268 |
+
%p_86 = arith.subf %post_mod_scores_82, %p_85 : tensor<128x64xf32> loc(#loc390)
|
| 269 |
+
%p_87 = math.exp2 %p_86 : tensor<128x64xf32> loc(#loc391)
|
| 270 |
+
%l_i_88 = arith.mulf %l_i_63, %alpha_84 : tensor<128xf32> loc(#loc392)
|
| 271 |
+
%l_i_89 = "tt.reduce"(%p_87) <{axis = 1 : i32}> ({
|
| 272 |
+
^bb0(%l_i_115: f32 loc(callsite(#loc1 at #loc393)), %l_i_116: f32 loc(callsite(#loc1 at #loc393))):
|
| 273 |
+
%l_i_117 = arith.addf %l_i_115, %l_i_116 : f32 loc(#loc470)
|
| 274 |
+
tt.reduce.return %l_i_117 : f32 loc(#loc444)
|
| 275 |
+
}) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc444)
|
| 276 |
+
%l_i_90 = arith.addf %l_i_88, %l_i_89 : tensor<128xf32> loc(#loc394)
|
| 277 |
+
%acc_91 = tt.expand_dims %alpha_84 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc395)
|
| 278 |
+
%acc_92 = tt.broadcast %acc_91 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc396)
|
| 279 |
+
%acc_93 = arith.mulf %acc_62, %acc_92 : tensor<128x128xf32> loc(#loc396)
|
| 280 |
+
%ptr_94 = tt.splat %V : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc446)
|
| 281 |
+
%ptr_95 = tt.addptr %ptr_94, %ptr_68 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc446)
|
| 282 |
+
%ptr_96 = tt.broadcast %ptr_95 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc447)
|
| 283 |
+
%ptr_97 = tt.addptr %ptr_96, %ptr_72 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc447)
|
| 284 |
+
%v = tt.load %ptr_97, %k_75, %cst_0 : tensor<64x128x!tt.ptr<bf16>> loc(#loc448)
|
| 285 |
+
%acc_98 = arith.truncf %p_87 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc398)
|
| 286 |
+
%acc_99 = tt.dot %acc_98, %v, %acc_93, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc399)
|
| 287 |
+
%cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc449)
|
| 288 |
+
%cur_block = tt.addptr %kv_indices_39, %cur_block_idx : !tt.ptr<i32>, i32 loc(#loc450)
|
| 289 |
+
%cur_block_100 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc451)
|
| 290 |
+
%next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc452)
|
| 291 |
+
%next_block_101 = arith.cmpi slt, %next_block, %kv_num_blocks_43 : i32 loc(#loc453)
|
| 292 |
+
%next_block_102 = tt.addptr %cur_block, %c1_i32 : !tt.ptr<i32>, i32 loc(#loc454)
|
| 293 |
+
%next_block_103 = tt.load %next_block_102, %next_block_101 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc455)
|
| 294 |
+
%needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc456)
|
| 295 |
+
%needs_jump_104 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc457)
|
| 296 |
+
%needs_jump_105 = arith.cmpi eq, %needs_jump_104, %c0_i32 : i32 loc(#loc458)
|
| 297 |
+
%jump_to_block = arith.subi %next_block_103, %cur_block_100 : i32 loc(#loc459)
|
| 298 |
+
%jump_to_block_106 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc460)
|
| 299 |
+
%jump_to_block_107 = arith.subi %jump_to_block_106, %c64_i32 : i32 loc(#loc461)
|
| 300 |
+
%offset = arith.extui %needs_jump_105 : i1 to i32 loc(#loc462)
|
| 301 |
+
%offset_108 = arith.muli %jump_to_block_107, %offset : i32 loc(#loc462)
|
| 302 |
+
%offset_109 = arith.subi %c1_i32, %offset : i32 loc(#loc463)
|
| 303 |
+
%offset_110 = arith.muli %offset_109, %c64_i32 : i32 loc(#loc464)
|
| 304 |
+
%offset_111 = arith.addi %offset_108, %offset_110 : i32 loc(#loc465)
|
| 305 |
+
%offs_n_112 = tt.splat %offset_111 : i32 -> tensor<1x64xi32> loc(#loc401)
|
| 306 |
+
%offs_n_113 = arith.addi %offs_n_64, %offs_n_112 : tensor<1x64xi32> loc(#loc401)
|
| 307 |
+
%kv_offset_114 = arith.addi %kv_offset_65, %offset_111 : i32 loc(#loc402)
|
| 308 |
+
scf.yield %acc_99, %l_i_90, %m_ij_83, %offs_n_113, %kv_offset_114 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc294)
|
| 309 |
+
} loc(#loc473)
|
| 310 |
+
%l_i_49 = arith.cmpf oeq, %kv_offset_48#1, %cst_14 : tensor<128xf32> loc(#loc295)
|
| 311 |
+
%l_i_50 = arith.select %l_i_49, %l_i, %kv_offset_48#1 : tensor<128xi1>, tensor<128xf32> loc(#loc175)
|
| 312 |
+
%acc_51 = tt.expand_dims %l_i_50 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc296)
|
| 313 |
+
%acc_52 = tt.broadcast %acc_51 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc297)
|
| 314 |
+
%acc_53 = arith.divf %kv_offset_48#0, %acc_52 : tensor<128x128xf32> loc(#loc297)
|
| 315 |
+
%mask_54 = arith.cmpi slt, %ptr_23, %mask : tensor<1x128xi32> loc(#loc174)
|
| 316 |
+
%mask_55 = tt.broadcast %mask_54 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc298)
|
| 317 |
+
%mask_56 = arith.andi %q_29, %mask_55 : tensor<128x128xi1> loc(#loc298)
|
| 318 |
+
%3 = tt.splat %q_offset_15 : i32 -> tensor<1x128xi32> loc(#loc142)
|
| 319 |
+
%4 = arith.addi %ptr_23, %3 : tensor<1x128xi32> loc(#loc142)
|
| 320 |
+
%5 = tt.broadcast %4 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc143)
|
| 321 |
+
%6 = tt.broadcast %ptr_20 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc143)
|
| 322 |
+
%7 = arith.addi %5, %6 : tensor<128x128xi32> loc(#loc143)
|
| 323 |
+
%8 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<128x128x!tt.ptr<bf16>> loc(#loc144)
|
| 324 |
+
%9 = tt.addptr %8, %7 : tensor<128x128x!tt.ptr<bf16>>, tensor<128x128xi32> loc(#loc144)
|
| 325 |
+
%10 = arith.truncf %acc_53 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc145)
|
| 326 |
+
tt.store %9, %10, %mask_56 : tensor<128x128x!tt.ptr<bf16>> loc(#loc145)
|
| 327 |
+
%off_hz = arith.muli %off_zq, %HQ : i32 loc(#loc299)
|
| 328 |
+
%off_hz_57 = arith.addi %off_hz, %off_hq : i32 loc(#loc300)
|
| 329 |
+
%l_ptrs = arith.muli %off_hz_57, %ks0 : i32 loc(#loc301)
|
| 330 |
+
%l_ptrs_58 = tt.addptr %arg_LSE, %l_ptrs : !tt.ptr<f32>, i32 loc(#loc302)
|
| 331 |
+
%l_ptrs_59 = tt.splat %l_ptrs_58 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>> loc(#loc303)
|
| 332 |
+
%l_ptrs_60 = tt.addptr %l_ptrs_59, %offs_m_19 : tensor<128x!tt.ptr<f32>>, tensor<128xi32> loc(#loc303)
|
| 333 |
+
%lse = math.log2 %l_i_50 : tensor<128xf32> loc(#loc304)
|
| 334 |
+
%lse_61 = arith.addf %kv_offset_48#2, %lse : tensor<128xf32> loc(#loc305)
|
| 335 |
+
%11 = tt.splat %ks0 : i32 -> tensor<128xi32> loc(#loc153)
|
| 336 |
+
%12 = arith.cmpi slt, %offs_m_19, %11 : tensor<128xi32> loc(#loc153)
|
| 337 |
+
tt.store %l_ptrs_60, %lse_61, %12 : tensor<128x!tt.ptr<f32>> loc(#loc154)
|
| 338 |
+
tt.return loc(#loc155)
|
| 339 |
+
} loc(#loc)
|
| 340 |
+
} loc(#loc)
|
| 341 |
+
#loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":292:23)
|
| 342 |
+
#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":146:101)
|
| 343 |
+
#loc5 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31)
|
| 344 |
+
#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":136:19)
|
| 345 |
+
#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":214:38)
|
| 346 |
+
#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":206:34)
|
| 347 |
+
#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":90:9)
|
| 348 |
+
#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":85:54)
|
| 349 |
+
#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":97:28)
|
| 350 |
+
#loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":98:27)
|
| 351 |
+
#loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":99:27)
|
| 352 |
+
#loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":104:24)
|
| 353 |
+
#loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":107:24)
|
| 354 |
+
#loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":107:45)
|
| 355 |
+
#loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":107:36)
|
| 356 |
+
#loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":108:47)
|
| 357 |
+
#loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":111:12)
|
| 358 |
+
#loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":112:12)
|
| 359 |
+
#loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":113:12)
|
| 360 |
+
#loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":143:97)
|
| 361 |
+
#loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":144:23)
|
| 362 |
+
#loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":144:46)
|
| 363 |
+
#loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":144:33)
|
| 364 |
+
#loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":284:27)
|
| 365 |
+
#loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":284:38)
|
| 366 |
+
#loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":284:20)
|
| 367 |
+
#loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":284:56)
|
| 368 |
+
#loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":284:49)
|
| 369 |
+
#loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":292:52)
|
| 370 |
+
#loc32 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":151:26)
|
| 371 |
+
#loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":152:23)
|
| 372 |
+
#loc34 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":152:37)
|
| 373 |
+
#loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":153:42)
|
| 374 |
+
#loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":153:28)
|
| 375 |
+
#loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":154:45)
|
| 376 |
+
#loc38 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22)
|
| 377 |
+
#loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":154:92)
|
| 378 |
+
#loc40 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28)
|
| 379 |
+
#loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":154:102)
|
| 380 |
+
#loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":154:65)
|
| 381 |
+
#loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":159:37)
|
| 382 |
+
#loc44 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":159:24)
|
| 383 |
+
#loc45 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":167:48)
|
| 384 |
+
#loc46 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":484:40)
|
| 385 |
+
#loc47 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":342:32)
|
| 386 |
+
#loc49 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":346:35)
|
| 387 |
+
#loc50 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":347:107)
|
| 388 |
+
#loc51 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":349:17)
|
| 389 |
+
#loc52 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":351:19)
|
| 390 |
+
#loc53 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":353:14)
|
| 391 |
+
#loc54 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":257:21)
|
| 392 |
+
#loc55 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":358:36)
|
| 393 |
+
#loc56 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":359:36)
|
| 394 |
+
#loc57 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":367:44)
|
| 395 |
+
#loc58 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":367:69)
|
| 396 |
+
#loc59 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":372:22)
|
| 397 |
+
#loc60 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":374:23)
|
| 398 |
+
#loc61 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":375:22)
|
| 399 |
+
#loc62 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":376:23)
|
| 400 |
+
#loc63 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":377:22)
|
| 401 |
+
#loc64 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":378:22)
|
| 402 |
+
#loc65 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":379:24)
|
| 403 |
+
#loc66 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":380:23)
|
| 404 |
+
#loc67 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":383:70)
|
| 405 |
+
#loc68 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":383:79)
|
| 406 |
+
#loc69 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":383:91)
|
| 407 |
+
#loc70 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":383:99)
|
| 408 |
+
#loc71 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":383:102)
|
| 409 |
+
#loc72 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":383:119)
|
| 410 |
+
#loc73 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":385:70)
|
| 411 |
+
#loc74 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":385:79)
|
| 412 |
+
#loc75 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":385:91)
|
| 413 |
+
#loc76 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":385:99)
|
| 414 |
+
#loc77 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":385:102)
|
| 415 |
+
#loc78 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":385:119)
|
| 416 |
+
#loc79 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":386:25)
|
| 417 |
+
#loc80 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":387:24)
|
| 418 |
+
#loc81 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":388:23)
|
| 419 |
+
#loc82 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":389:23)
|
| 420 |
+
#loc83 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":394:73)
|
| 421 |
+
#loc84 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":396:69)
|
| 422 |
+
#loc85 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":399:27)
|
| 423 |
+
#loc86 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40)
|
| 424 |
+
#loc88 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27)
|
| 425 |
+
#loc89 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":403:27)
|
| 426 |
+
#loc90 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":405:35)
|
| 427 |
+
#loc91 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":406:51)
|
| 428 |
+
#loc92 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":410:31)
|
| 429 |
+
#loc93 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":410:25)
|
| 430 |
+
#loc94 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":411:51)
|
| 431 |
+
#loc95 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":411:39)
|
| 432 |
+
#loc96 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":411:21)
|
| 433 |
+
#loc97 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":416:16)
|
| 434 |
+
#loc98 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
|
| 435 |
+
#loc100 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
|
| 436 |
+
#loc101 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":416:24)
|
| 437 |
+
#loc102 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":418:22)
|
| 438 |
+
#loc103 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":418:16)
|
| 439 |
+
#loc104 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":421:107)
|
| 440 |
+
#loc105 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":422:22)
|
| 441 |
+
#loc106 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":422:44)
|
| 442 |
+
#loc107 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":247:33)
|
| 443 |
+
#loc108 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":527:63)
|
| 444 |
+
#loc109 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":248:38)
|
| 445 |
+
#loc110 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":248:24)
|
| 446 |
+
#loc111 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":249:109)
|
| 447 |
+
#loc112 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":249:113)
|
| 448 |
+
#loc113 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":249:55)
|
| 449 |
+
#loc114 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":249:25)
|
| 450 |
+
#loc115 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":250:30)
|
| 451 |
+
#loc116 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":250:35)
|
| 452 |
+
#loc117 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":250:60)
|
| 453 |
+
#loc118 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":251:34)
|
| 454 |
+
#loc119 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":251:48)
|
| 455 |
+
#loc120 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":251:63)
|
| 456 |
+
#loc121 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":252:29)
|
| 457 |
+
#loc122 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":252:47)
|
| 458 |
+
#loc123 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":252:61)
|
| 459 |
+
#loc124 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":252:42)
|
| 460 |
+
#loc125 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":530:26)
|
| 461 |
+
#loc126 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":531:21)
|
| 462 |
+
#loc127 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":531:8)
|
| 463 |
+
#loc128 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":181:35)
|
| 464 |
+
#loc129 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":182:27)
|
| 465 |
+
#loc130 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":182:41)
|
| 466 |
+
#loc131 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":183:51)
|
| 467 |
+
#loc132 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":183:32)
|
| 468 |
+
#loc133 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":184:49)
|
| 469 |
+
#loc134 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":184:69)
|
| 470 |
+
#loc135 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":186:28)
|
| 471 |
+
#loc136 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":193:52)
|
| 472 |
+
#loc138 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":206:26)
|
| 473 |
+
#loc139 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":208:20)
|
| 474 |
+
#loc140 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":208:16)
|
| 475 |
+
#loc141 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":214:30)
|
| 476 |
+
#loc142 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":218:49)
|
| 477 |
+
#loc143 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":218:62)
|
| 478 |
+
#loc144 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":218:25)
|
| 479 |
+
#loc145 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":218:92)
|
| 480 |
+
#loc146 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":221:26)
|
| 481 |
+
#loc147 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":221:31)
|
| 482 |
+
#loc148 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":222:32)
|
| 483 |
+
#loc149 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":222:23)
|
| 484 |
+
#loc150 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":222:40)
|
| 485 |
+
#loc151 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":223:33)
|
| 486 |
+
#loc152 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":223:20)
|
| 487 |
+
#loc153 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":227:48)
|
| 488 |
+
#loc154 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":227:29)
|
| 489 |
+
#loc155 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/fs/cfs2of6ygpacl7vcs74vy37xwnw4avrqcbqon6ht56dira6khkkl.py":229:4)
|
| 490 |
+
#loc171 = loc(callsite(#loc1 at #loc2))
|
| 491 |
+
#loc172 = loc("q"(#loc4))
|
| 492 |
+
#loc173 = loc("acc"(#loc6))
|
| 493 |
+
#loc174 = loc("mask"(#loc7))
|
| 494 |
+
#loc175 = loc("l_i"(#loc8))
|
| 495 |
+
#loc176 = loc("HQ"(#loc9))
|
| 496 |
+
#loc177 = loc("q_start"(#loc11))
|
| 497 |
+
#loc178 = loc("off_zq"(#loc12))
|
| 498 |
+
#loc179 = loc("off_hq"(#loc13))
|
| 499 |
+
#loc180 = loc("off_hkv"(#loc14))
|
| 500 |
+
#loc181 = loc("q_offset"(#loc15))
|
| 501 |
+
#loc182 = loc("q_offset"(#loc16))
|
| 502 |
+
#loc183 = loc("q_offset"(#loc17))
|
| 503 |
+
#loc184 = loc("k_offset"(#loc18))
|
| 504 |
+
#loc185 = loc("Q"(#loc19))
|
| 505 |
+
#loc186 = loc("K"(#loc20))
|
| 506 |
+
#loc187 = loc("V"(#loc21))
|
| 507 |
+
#loc188 = loc("sparse_kv_idx_offset"(#loc22))
|
| 508 |
+
#loc189 = loc("offs_m"(#loc23))
|
| 509 |
+
#loc190 = loc("offs_m"(#loc24))
|
| 510 |
+
#loc191 = loc("offs_m"(#loc25))
|
| 511 |
+
#loc192 = loc("ptr"(#loc26))
|
| 512 |
+
#loc193 = loc("ptr"(#loc27))
|
| 513 |
+
#loc194 = loc("ptr"(#loc28))
|
| 514 |
+
#loc195 = loc("ptr"(#loc29))
|
| 515 |
+
#loc196 = loc("ptr"(#loc30))
|
| 516 |
+
#loc197 = loc("kv_indices"(#loc32))
|
| 517 |
+
#loc198 = loc("kv_start"(#loc33))
|
| 518 |
+
#loc199 = loc("kv_start"(#loc34))
|
| 519 |
+
#loc200 = loc("kv_num_blocks"(#loc35))
|
| 520 |
+
#loc201 = loc("kv_num_blocks"(#loc36))
|
| 521 |
+
#loc202 = loc("block_n_end"(#loc37))
|
| 522 |
+
#loc203 = loc("block_n_end"(#loc39))
|
| 523 |
+
#loc204 = loc("block_n_end"(#loc41))
|
| 524 |
+
#loc205 = loc("block_n_end"(#loc42))
|
| 525 |
+
#loc206 = loc("offs_n"(#loc43))
|
| 526 |
+
#loc207 = loc("offs_n"(#loc44))
|
| 527 |
+
#loc208 = loc("acc"(#loc46))
|
| 528 |
+
#loc209 = loc("kv_base_offset"(#loc47))
|
| 529 |
+
#loc211 = loc("offs_n_load"(#loc49))
|
| 530 |
+
#loc212 = loc("k"(#loc50))
|
| 531 |
+
#loc213 = loc("k"(#loc51))
|
| 532 |
+
#loc214 = loc("qk"(#loc52))
|
| 533 |
+
#loc215 = loc("qk"(#loc53))
|
| 534 |
+
#loc216 = loc("m"(#loc55))
|
| 535 |
+
#loc217 = loc("n"(#loc56))
|
| 536 |
+
#loc218 = loc("post_mod_scores"(#loc57))
|
| 537 |
+
#loc219 = loc("post_mod_scores"(#loc58))
|
| 538 |
+
#loc220 = loc("tmp3"(#loc59))
|
| 539 |
+
#loc221 = loc("tmp5"(#loc60))
|
| 540 |
+
#loc222 = loc("tmp6"(#loc61))
|
| 541 |
+
#loc223 = loc("tmp7"(#loc62))
|
| 542 |
+
#loc224 = loc("tmp8"(#loc63))
|
| 543 |
+
#loc225 = loc("tmp9"(#loc64))
|
| 544 |
+
#loc226 = loc("tmp10"(#loc65))
|
| 545 |
+
#loc227 = loc("tmp11"(#loc66))
|
| 546 |
+
#loc228 = loc("tmp14"(#loc67))
|
| 547 |
+
#loc229 = loc("tmp14"(#loc68))
|
| 548 |
+
#loc230 = loc("tmp14"(#loc69))
|
| 549 |
+
#loc231 = loc("tmp14"(#loc70))
|
| 550 |
+
#loc232 = loc("tmp14"(#loc71))
|
| 551 |
+
#loc233 = loc("tmp14"(#loc72))
|
| 552 |
+
#loc234 = loc("tmp16"(#loc73))
|
| 553 |
+
#loc235 = loc("tmp16"(#loc74))
|
| 554 |
+
#loc236 = loc("tmp16"(#loc75))
|
| 555 |
+
#loc237 = loc("tmp16"(#loc76))
|
| 556 |
+
#loc238 = loc("tmp16"(#loc77))
|
| 557 |
+
#loc239 = loc("tmp16"(#loc78))
|
| 558 |
+
#loc240 = loc("tmp17"(#loc79))
|
| 559 |
+
#loc241 = loc("tmp18"(#loc80))
|
| 560 |
+
#loc242 = loc("tmp19"(#loc81))
|
| 561 |
+
#loc243 = loc("tmp20"(#loc82))
|
| 562 |
+
#loc244 = loc("mask_mod_output"(#loc83))
|
| 563 |
+
#loc245 = loc("post_mod_scores"(#loc84))
|
| 564 |
+
#loc246 = loc("post_mod_scores"(#loc85))
|
| 565 |
+
#loc248 = loc("m_ij"(#loc89))
|
| 566 |
+
#loc249 = loc("masked_out_rows"(#loc90))
|
| 567 |
+
#loc250 = loc("m_ij_masked"(#loc91))
|
| 568 |
+
#loc251 = loc("alpha"(#loc92))
|
| 569 |
+
#loc252 = loc("alpha"(#loc93))
|
| 570 |
+
#loc253 = loc("p"(#loc94))
|
| 571 |
+
#loc254 = loc("p"(#loc95))
|
| 572 |
+
#loc255 = loc("p"(#loc96))
|
| 573 |
+
#loc256 = loc("l_i"(#loc97))
|
| 574 |
+
#loc258 = loc("l_i"(#loc101))
|
| 575 |
+
#loc259 = loc("acc"(#loc102))
|
| 576 |
+
#loc260 = loc("acc"(#loc103))
|
| 577 |
+
#loc261 = loc("v"(#loc104))
|
| 578 |
+
#loc262 = loc("acc"(#loc105))
|
| 579 |
+
#loc263 = loc("acc"(#loc106))
|
| 580 |
+
#loc264 = loc("cur_block_idx"(#loc107))
|
| 581 |
+
#loc265 = loc("offset"(#loc108))
|
| 582 |
+
#loc266 = loc("cur_block"(#loc109))
|
| 583 |
+
#loc267 = loc("cur_block"(#loc110))
|
| 584 |
+
#loc268 = loc("next_block"(#loc111))
|
| 585 |
+
#loc269 = loc("next_block"(#loc112))
|
| 586 |
+
#loc270 = loc("next_block"(#loc113))
|
| 587 |
+
#loc271 = loc("next_block"(#loc114))
|
| 588 |
+
#loc272 = loc("needs_jump"(#loc115))
|
| 589 |
+
#loc273 = loc("needs_jump"(#loc116))
|
| 590 |
+
#loc274 = loc("needs_jump"(#loc117))
|
| 591 |
+
#loc275 = loc("jump_to_block"(#loc118))
|
| 592 |
+
#loc276 = loc("jump_to_block"(#loc119))
|
| 593 |
+
#loc277 = loc("jump_to_block"(#loc120))
|
| 594 |
+
#loc278 = loc("offset"(#loc121))
|
| 595 |
+
#loc279 = loc("offset"(#loc122))
|
| 596 |
+
#loc280 = loc("offset"(#loc123))
|
| 597 |
+
#loc281 = loc("offset"(#loc124))
|
| 598 |
+
#loc282 = loc("offs_n"(#loc125))
|
| 599 |
+
#loc283 = loc("kv_offset"(#loc126))
|
| 600 |
+
#loc284 = loc(callsite(#loc127 at #loc2))
|
| 601 |
+
#loc285 = loc("kv_indices"(#loc128))
|
| 602 |
+
#loc286 = loc("kv_start"(#loc129))
|
| 603 |
+
#loc287 = loc("kv_start"(#loc130))
|
| 604 |
+
#loc288 = loc("kv_num_blocks"(#loc131))
|
| 605 |
+
#loc289 = loc("kv_num_blocks"(#loc132))
|
| 606 |
+
#loc290 = loc("block_n_end"(#loc133))
|
| 607 |
+
#loc291 = loc("block_n_end"(#loc134))
|
| 608 |
+
#loc292 = loc("offs_n"(#loc135))
|
| 609 |
+
#loc294 = loc(callsite(#loc127 at #loc137))
|
| 610 |
+
#loc295 = loc("l_i"(#loc138))
|
| 611 |
+
#loc296 = loc("acc"(#loc139))
|
| 612 |
+
#loc297 = loc("acc"(#loc140))
|
| 613 |
+
#loc298 = loc("mask"(#loc141))
|
| 614 |
+
#loc299 = loc("off_hz"(#loc146))
|
| 615 |
+
#loc300 = loc("off_hz"(#loc147))
|
| 616 |
+
#loc301 = loc("l_ptrs"(#loc148))
|
| 617 |
+
#loc302 = loc("l_ptrs"(#loc149))
|
| 618 |
+
#loc303 = loc("l_ptrs"(#loc150))
|
| 619 |
+
#loc304 = loc("lse"(#loc151))
|
| 620 |
+
#loc305 = loc("lse"(#loc152))
|
| 621 |
+
#loc306 = loc(callsite(#loc3 at #loc172))
|
| 622 |
+
#loc307 = loc(callsite(#loc5 at #loc173))
|
| 623 |
+
#loc308 = loc(callsite(#loc192 at #loc172))
|
| 624 |
+
#loc309 = loc(callsite(#loc193 at #loc172))
|
| 625 |
+
#loc310 = loc(callsite(#loc194 at #loc172))
|
| 626 |
+
#loc311 = loc(callsite(#loc195 at #loc172))
|
| 627 |
+
#loc312 = loc(callsite(#loc196 at #loc172))
|
| 628 |
+
#loc313 = loc(callsite(#loc31 at #loc172))
|
| 629 |
+
#loc314 = loc(callsite(#loc38 at #loc203))
|
| 630 |
+
#loc315 = loc(callsite(#loc40 at #loc203))
|
| 631 |
+
#loc316 = loc("l_i"(#loc208))
|
| 632 |
+
#loc317 = loc(callsite(#loc209 at #loc210))
|
| 633 |
+
#loc318 = loc(callsite(#loc211 at #loc210))
|
| 634 |
+
#loc319 = loc(callsite(#loc212 at #loc210))
|
| 635 |
+
#loc320 = loc(callsite(#loc213 at #loc210))
|
| 636 |
+
#loc321 = loc(callsite(#loc214 at #loc210))
|
| 637 |
+
#loc322 = loc(callsite(#loc215 at #loc210))
|
| 638 |
+
#loc323 = loc(callsite(#loc216 at #loc210))
|
| 639 |
+
#loc324 = loc(callsite(#loc217 at #loc210))
|
| 640 |
+
#loc325 = loc(callsite(#loc218 at #loc210))
|
| 641 |
+
#loc326 = loc(callsite(#loc219 at #loc210))
|
| 642 |
+
#loc327 = loc(callsite(#loc220 at #loc210))
|
| 643 |
+
#loc328 = loc(callsite(#loc221 at #loc210))
|
| 644 |
+
#loc329 = loc(callsite(#loc222 at #loc210))
|
| 645 |
+
#loc330 = loc(callsite(#loc223 at #loc210))
|
| 646 |
+
#loc331 = loc(callsite(#loc224 at #loc210))
|
| 647 |
+
#loc332 = loc(callsite(#loc225 at #loc210))
|
| 648 |
+
#loc333 = loc(callsite(#loc226 at #loc210))
|
| 649 |
+
#loc334 = loc(callsite(#loc227 at #loc210))
|
| 650 |
+
#loc335 = loc(callsite(#loc228 at #loc210))
|
| 651 |
+
#loc336 = loc(callsite(#loc229 at #loc210))
|
| 652 |
+
#loc337 = loc(callsite(#loc230 at #loc210))
|
| 653 |
+
#loc338 = loc(callsite(#loc231 at #loc210))
|
| 654 |
+
#loc339 = loc(callsite(#loc232 at #loc210))
|
| 655 |
+
#loc340 = loc(callsite(#loc233 at #loc210))
|
| 656 |
+
#loc341 = loc(callsite(#loc234 at #loc210))
|
| 657 |
+
#loc342 = loc(callsite(#loc235 at #loc210))
|
| 658 |
+
#loc343 = loc(callsite(#loc236 at #loc210))
|
| 659 |
+
#loc344 = loc(callsite(#loc237 at #loc210))
|
| 660 |
+
#loc345 = loc(callsite(#loc238 at #loc210))
|
| 661 |
+
#loc346 = loc(callsite(#loc239 at #loc210))
|
| 662 |
+
#loc347 = loc(callsite(#loc240 at #loc210))
|
| 663 |
+
#loc348 = loc(callsite(#loc241 at #loc210))
|
| 664 |
+
#loc349 = loc(callsite(#loc242 at #loc210))
|
| 665 |
+
#loc350 = loc(callsite(#loc243 at #loc210))
|
| 666 |
+
#loc351 = loc(callsite(#loc244 at #loc210))
|
| 667 |
+
#loc352 = loc(callsite(#loc245 at #loc210))
|
| 668 |
+
#loc353 = loc(callsite(#loc246 at #loc210))
|
| 669 |
+
#loc355 = loc(callsite(#loc248 at #loc210))
|
| 670 |
+
#loc356 = loc(callsite(#loc249 at #loc210))
|
| 671 |
+
#loc357 = loc(callsite(#loc250 at #loc210))
|
| 672 |
+
#loc358 = loc(callsite(#loc251 at #loc210))
|
| 673 |
+
#loc359 = loc(callsite(#loc252 at #loc210))
|
| 674 |
+
#loc360 = loc(callsite(#loc253 at #loc210))
|
| 675 |
+
#loc361 = loc(callsite(#loc254 at #loc210))
|
| 676 |
+
#loc362 = loc(callsite(#loc255 at #loc210))
|
| 677 |
+
#loc363 = loc(callsite(#loc256 at #loc210))
|
| 678 |
+
#loc365 = loc(callsite(#loc258 at #loc210))
|
| 679 |
+
#loc366 = loc(callsite(#loc259 at #loc210))
|
| 680 |
+
#loc367 = loc(callsite(#loc260 at #loc210))
|
| 681 |
+
#loc368 = loc(callsite(#loc261 at #loc210))
|
| 682 |
+
#loc369 = loc(callsite(#loc262 at #loc210))
|
| 683 |
+
#loc370 = loc(callsite(#loc263 at #loc210))
|
| 684 |
+
#loc371 = loc(callsite(#loc265 at #loc2))
|
| 685 |
+
#loc372 = loc(callsite(#loc282 at #loc2))
|
| 686 |
+
#loc373 = loc(callsite(#loc283 at #loc2))
|
| 687 |
+
#loc374 = loc(callsite(#loc209 at #loc293))
|
| 688 |
+
#loc375 = loc(callsite(#loc211 at #loc293))
|
| 689 |
+
#loc376 = loc(callsite(#loc212 at #loc293))
|
| 690 |
+
#loc377 = loc(callsite(#loc213 at #loc293))
|
| 691 |
+
#loc378 = loc(callsite(#loc214 at #loc293))
|
| 692 |
+
#loc379 = loc(callsite(#loc215 at #loc293))
|
| 693 |
+
#loc380 = loc(callsite(#loc218 at #loc293))
|
| 694 |
+
#loc381 = loc(callsite(#loc219 at #loc293))
|
| 695 |
+
#loc382 = loc(callsite(#loc246 at #loc293))
|
| 696 |
+
#loc384 = loc(callsite(#loc248 at #loc293))
|
| 697 |
+
#loc385 = loc(callsite(#loc249 at #loc293))
|
| 698 |
+
#loc386 = loc(callsite(#loc250 at #loc293))
|
| 699 |
+
#loc387 = loc(callsite(#loc251 at #loc293))
|
| 700 |
+
#loc388 = loc(callsite(#loc252 at #loc293))
|
| 701 |
+
#loc389 = loc(callsite(#loc253 at #loc293))
|
| 702 |
+
#loc390 = loc(callsite(#loc254 at #loc293))
|
| 703 |
+
#loc391 = loc(callsite(#loc255 at #loc293))
|
| 704 |
+
#loc392 = loc(callsite(#loc256 at #loc293))
|
| 705 |
+
#loc394 = loc(callsite(#loc258 at #loc293))
|
| 706 |
+
#loc395 = loc(callsite(#loc259 at #loc293))
|
| 707 |
+
#loc396 = loc(callsite(#loc260 at #loc293))
|
| 708 |
+
#loc397 = loc(callsite(#loc261 at #loc293))
|
| 709 |
+
#loc398 = loc(callsite(#loc262 at #loc293))
|
| 710 |
+
#loc399 = loc(callsite(#loc263 at #loc293))
|
| 711 |
+
#loc400 = loc(callsite(#loc265 at #loc137))
|
| 712 |
+
#loc401 = loc(callsite(#loc282 at #loc137))
|
| 713 |
+
#loc402 = loc(callsite(#loc283 at #loc137))
|
| 714 |
+
#loc403 = loc("m_i"(#loc316))
|
| 715 |
+
#loc404 = loc(callsite(#loc192 at #loc319))
|
| 716 |
+
#loc405 = loc(callsite(#loc193 at #loc319))
|
| 717 |
+
#loc406 = loc(callsite(#loc194 at #loc319))
|
| 718 |
+
#loc407 = loc(callsite(#loc196 at #loc319))
|
| 719 |
+
#loc408 = loc(callsite(#loc31 at #loc319))
|
| 720 |
+
#loc409 = loc(callsite(#loc3 at #loc319))
|
| 721 |
+
#loc410 = loc(callsite(#loc54 at #loc323))
|
| 722 |
+
#loc411 = loc(callsite(#loc54 at #loc324))
|
| 723 |
+
#loc412 = loc(callsite(#loc86 at #loc354))
|
| 724 |
+
#loc414 = loc(callsite(#loc98 at #loc364))
|
| 725 |
+
#loc416 = loc(callsite(#loc194 at #loc368))
|
| 726 |
+
#loc417 = loc(callsite(#loc196 at #loc368))
|
| 727 |
+
#loc418 = loc(callsite(#loc3 at #loc368))
|
| 728 |
+
#loc419 = loc(callsite(#loc264 at #loc371))
|
| 729 |
+
#loc420 = loc(callsite(#loc266 at #loc371))
|
| 730 |
+
#loc421 = loc(callsite(#loc267 at #loc371))
|
| 731 |
+
#loc422 = loc(callsite(#loc268 at #loc371))
|
| 732 |
+
#loc423 = loc(callsite(#loc269 at #loc371))
|
| 733 |
+
#loc424 = loc(callsite(#loc270 at #loc371))
|
| 734 |
+
#loc425 = loc(callsite(#loc271 at #loc371))
|
| 735 |
+
#loc426 = loc(callsite(#loc272 at #loc371))
|
| 736 |
+
#loc427 = loc(callsite(#loc273 at #loc371))
|
| 737 |
+
#loc428 = loc(callsite(#loc274 at #loc371))
|
| 738 |
+
#loc429 = loc(callsite(#loc275 at #loc371))
|
| 739 |
+
#loc430 = loc(callsite(#loc276 at #loc371))
|
| 740 |
+
#loc431 = loc(callsite(#loc277 at #loc371))
|
| 741 |
+
#loc432 = loc(callsite(#loc278 at #loc371))
|
| 742 |
+
#loc433 = loc(callsite(#loc279 at #loc371))
|
| 743 |
+
#loc434 = loc(callsite(#loc280 at #loc371))
|
| 744 |
+
#loc435 = loc(callsite(#loc281 at #loc371))
|
| 745 |
+
#loc436 = loc(callsite(#loc192 at #loc376))
|
| 746 |
+
#loc437 = loc(callsite(#loc193 at #loc376))
|
| 747 |
+
#loc438 = loc(callsite(#loc194 at #loc376))
|
| 748 |
+
#loc439 = loc(callsite(#loc196 at #loc376))
|
| 749 |
+
#loc440 = loc(callsite(#loc31 at #loc376))
|
| 750 |
+
#loc441 = loc(callsite(#loc3 at #loc376))
|
| 751 |
+
#loc442 = loc(callsite(#loc86 at #loc383))
|
| 752 |
+
#loc444 = loc(callsite(#loc98 at #loc393))
|
| 753 |
+
#loc446 = loc(callsite(#loc194 at #loc397))
|
| 754 |
+
#loc447 = loc(callsite(#loc196 at #loc397))
|
| 755 |
+
#loc448 = loc(callsite(#loc3 at #loc397))
|
| 756 |
+
#loc449 = loc(callsite(#loc264 at #loc400))
|
| 757 |
+
#loc450 = loc(callsite(#loc266 at #loc400))
|
| 758 |
+
#loc451 = loc(callsite(#loc267 at #loc400))
|
| 759 |
+
#loc452 = loc(callsite(#loc268 at #loc400))
|
| 760 |
+
#loc453 = loc(callsite(#loc269 at #loc400))
|
| 761 |
+
#loc454 = loc(callsite(#loc270 at #loc400))
|
| 762 |
+
#loc455 = loc(callsite(#loc271 at #loc400))
|
| 763 |
+
#loc456 = loc(callsite(#loc272 at #loc400))
|
| 764 |
+
#loc457 = loc(callsite(#loc273 at #loc400))
|
| 765 |
+
#loc458 = loc(callsite(#loc274 at #loc400))
|
| 766 |
+
#loc459 = loc(callsite(#loc275 at #loc400))
|
| 767 |
+
#loc460 = loc(callsite(#loc276 at #loc400))
|
| 768 |
+
#loc461 = loc(callsite(#loc277 at #loc400))
|
| 769 |
+
#loc462 = loc(callsite(#loc278 at #loc400))
|
| 770 |
+
#loc463 = loc(callsite(#loc279 at #loc400))
|
| 771 |
+
#loc464 = loc(callsite(#loc280 at #loc400))
|
| 772 |
+
#loc465 = loc(callsite(#loc281 at #loc400))
|
| 773 |
+
#loc466 = loc("offs_n"(#loc403))
|
| 774 |
+
#loc467 = loc(callsite(#loc88 at #loc412))
|
| 775 |
+
#loc468 = loc(callsite(#loc100 at #loc414))
|
| 776 |
+
#loc469 = loc(callsite(#loc88 at #loc442))
|
| 777 |
+
#loc470 = loc(callsite(#loc100 at #loc444))
|
| 778 |
+
#loc471 = loc("kv_offset"(#loc466))
|
| 779 |
+
#loc472 = loc(callsite(#loc471 at #loc2))
|
| 780 |
+
#loc473 = loc(callsite(#loc471 at #loc137))
|
progress/SpecForge/cache/compiled_kernels/triton/2/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/__grp__triton_per_fused_mul_1.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"child_paths": {"triton_per_fused_mul_1.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.source", "triton_per_fused_mul_1.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.ttir", "triton_per_fused_mul_1.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.ttgir", "triton_per_fused_mul_1.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.llir", "triton_per_fused_mul_1.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.ptx", "triton_per_fused_mul_1.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.cubin", "triton_per_fused_mul_1.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.json"}}
|
progress/SpecForge/cache/compiled_kernels/triton/2/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.cubin
ADDED
|
Binary file (40.6 kB). View file
|
|
|
progress/SpecForge/cache/compiled_kernels/triton/2/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"hash": "d74899755dbce272e791805e3b63e07d55dd638854536aa9def087e90d782d2d", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 4096, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused_mul_1"}
|
progress/SpecForge/cache/compiled_kernels/triton/2/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.llir
ADDED
|
@@ -0,0 +1,1190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
; ModuleID = 'LLVMDialectModule'
|
| 2 |
+
source_filename = "LLVMDialectModule"
|
| 3 |
+
target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
| 4 |
+
|
| 5 |
+
@global_smem = external addrspace(3) global [0 x i8], align 16
|
| 6 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
| 7 |
+
|
| 8 |
+
; Function Attrs: nounwind
|
| 9 |
+
define ptx_kernel void @triton_per_fused_mul_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i64 %5, i32 %6, i32 %7, ptr addrspace(1) readnone captures(none) %8, ptr addrspace(1) readnone captures(none) %9) local_unnamed_addr #0 !dbg !5 {
|
| 10 |
+
%11 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8
|
| 11 |
+
%12 = shl i32 %11, 7, !dbg !9
|
| 12 |
+
%13 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
| 13 |
+
%14 = lshr i32 %13, 5, !dbg !10
|
| 14 |
+
%15 = and i32 %13, 31, !dbg !10
|
| 15 |
+
%16 = shl nuw nsw i32 %15, 2, !dbg !10
|
| 16 |
+
%17 = or disjoint i32 %16, %12, !dbg !11
|
| 17 |
+
%18 = icmp slt i32 %17, %6, !dbg !12
|
| 18 |
+
%19 = and i32 %14, 7, !dbg !13
|
| 19 |
+
%20 = or disjoint i32 %19, 8, !dbg !13
|
| 20 |
+
%21 = or disjoint i32 %19, 16, !dbg !13
|
| 21 |
+
%22 = or i32 %14, 24, !dbg !13
|
| 22 |
+
%23 = sext i32 %17 to i64, !dbg !14
|
| 23 |
+
%24 = shl i64 %5, 5, !dbg !15
|
| 24 |
+
%25 = zext nneg i32 %19 to i64, !dbg !16
|
| 25 |
+
%26 = zext nneg i32 %20 to i64, !dbg !16
|
| 26 |
+
%27 = zext nneg i32 %21 to i64, !dbg !16
|
| 27 |
+
%28 = zext nneg i32 %22 to i64, !dbg !16
|
| 28 |
+
%29 = mul i64 %24, %25, !dbg !16
|
| 29 |
+
%30 = mul i64 %24, %26, !dbg !16
|
| 30 |
+
%31 = mul i64 %24, %27, !dbg !16
|
| 31 |
+
%32 = mul i64 %24, %28, !dbg !16
|
| 32 |
+
%33 = add i64 %29, %23, !dbg !17
|
| 33 |
+
%34 = add i64 %30, %23, !dbg !17
|
| 34 |
+
%35 = add i64 %31, %23, !dbg !17
|
| 35 |
+
%36 = add i64 %32, %23, !dbg !17
|
| 36 |
+
%37 = getelementptr float, ptr addrspace(1) %0, i64 %33, !dbg !18
|
| 37 |
+
%38 = getelementptr float, ptr addrspace(1) %0, i64 %34, !dbg !18
|
| 38 |
+
%39 = getelementptr float, ptr addrspace(1) %0, i64 %35, !dbg !18
|
| 39 |
+
%40 = getelementptr float, ptr addrspace(1) %0, i64 %36, !dbg !18
|
| 40 |
+
%41 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %37, i1 %18) #6, !dbg !19
|
| 41 |
+
%42 = extractvalue { i32, i32, i32, i32 } %41, 0, !dbg !19
|
| 42 |
+
%43 = extractvalue { i32, i32, i32, i32 } %41, 1, !dbg !19
|
| 43 |
+
%44 = extractvalue { i32, i32, i32, i32 } %41, 2, !dbg !19
|
| 44 |
+
%45 = extractvalue { i32, i32, i32, i32 } %41, 3, !dbg !19
|
| 45 |
+
%46 = bitcast i32 %42 to float, !dbg !19
|
| 46 |
+
%47 = bitcast i32 %43 to float, !dbg !19
|
| 47 |
+
%48 = bitcast i32 %44 to float, !dbg !19
|
| 48 |
+
%49 = bitcast i32 %45 to float, !dbg !19
|
| 49 |
+
%50 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %38, i1 %18) #6, !dbg !19
|
| 50 |
+
%51 = extractvalue { i32, i32, i32, i32 } %50, 0, !dbg !19
|
| 51 |
+
%52 = extractvalue { i32, i32, i32, i32 } %50, 1, !dbg !19
|
| 52 |
+
%53 = extractvalue { i32, i32, i32, i32 } %50, 2, !dbg !19
|
| 53 |
+
%54 = extractvalue { i32, i32, i32, i32 } %50, 3, !dbg !19
|
| 54 |
+
%55 = bitcast i32 %51 to float, !dbg !19
|
| 55 |
+
%56 = bitcast i32 %52 to float, !dbg !19
|
| 56 |
+
%57 = bitcast i32 %53 to float, !dbg !19
|
| 57 |
+
%58 = bitcast i32 %54 to float, !dbg !19
|
| 58 |
+
%59 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %39, i1 %18) #6, !dbg !19
|
| 59 |
+
%60 = extractvalue { i32, i32, i32, i32 } %59, 0, !dbg !19
|
| 60 |
+
%61 = extractvalue { i32, i32, i32, i32 } %59, 1, !dbg !19
|
| 61 |
+
%62 = extractvalue { i32, i32, i32, i32 } %59, 2, !dbg !19
|
| 62 |
+
%63 = extractvalue { i32, i32, i32, i32 } %59, 3, !dbg !19
|
| 63 |
+
%64 = bitcast i32 %60 to float, !dbg !19
|
| 64 |
+
%65 = bitcast i32 %61 to float, !dbg !19
|
| 65 |
+
%66 = bitcast i32 %62 to float, !dbg !19
|
| 66 |
+
%67 = bitcast i32 %63 to float, !dbg !19
|
| 67 |
+
%68 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %40, i1 %18) #6, !dbg !19
|
| 68 |
+
%69 = extractvalue { i32, i32, i32, i32 } %68, 0, !dbg !19
|
| 69 |
+
%70 = extractvalue { i32, i32, i32, i32 } %68, 1, !dbg !19
|
| 70 |
+
%71 = extractvalue { i32, i32, i32, i32 } %68, 2, !dbg !19
|
| 71 |
+
%72 = extractvalue { i32, i32, i32, i32 } %68, 3, !dbg !19
|
| 72 |
+
%73 = bitcast i32 %69 to float, !dbg !19
|
| 73 |
+
%74 = bitcast i32 %70 to float, !dbg !19
|
| 74 |
+
%75 = bitcast i32 %71 to float, !dbg !19
|
| 75 |
+
%76 = bitcast i32 %72 to float, !dbg !19
|
| 76 |
+
%77 = getelementptr float, ptr addrspace(1) %1, i64 %33, !dbg !20
|
| 77 |
+
%78 = getelementptr float, ptr addrspace(1) %1, i64 %34, !dbg !20
|
| 78 |
+
%79 = getelementptr float, ptr addrspace(1) %1, i64 %35, !dbg !20
|
| 79 |
+
%80 = getelementptr float, ptr addrspace(1) %1, i64 %36, !dbg !20
|
| 80 |
+
%81 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %77, i1 %18) #6, !dbg !21
|
| 81 |
+
%82 = extractvalue { i32, i32, i32, i32 } %81, 0, !dbg !21
|
| 82 |
+
%83 = extractvalue { i32, i32, i32, i32 } %81, 1, !dbg !21
|
| 83 |
+
%84 = extractvalue { i32, i32, i32, i32 } %81, 2, !dbg !21
|
| 84 |
+
%85 = extractvalue { i32, i32, i32, i32 } %81, 3, !dbg !21
|
| 85 |
+
%86 = bitcast i32 %82 to float, !dbg !21
|
| 86 |
+
%87 = bitcast i32 %83 to float, !dbg !21
|
| 87 |
+
%88 = bitcast i32 %84 to float, !dbg !21
|
| 88 |
+
%89 = bitcast i32 %85 to float, !dbg !21
|
| 89 |
+
%90 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %78, i1 %18) #6, !dbg !21
|
| 90 |
+
%91 = extractvalue { i32, i32, i32, i32 } %90, 0, !dbg !21
|
| 91 |
+
%92 = extractvalue { i32, i32, i32, i32 } %90, 1, !dbg !21
|
| 92 |
+
%93 = extractvalue { i32, i32, i32, i32 } %90, 2, !dbg !21
|
| 93 |
+
%94 = extractvalue { i32, i32, i32, i32 } %90, 3, !dbg !21
|
| 94 |
+
%95 = bitcast i32 %91 to float, !dbg !21
|
| 95 |
+
%96 = bitcast i32 %92 to float, !dbg !21
|
| 96 |
+
%97 = bitcast i32 %93 to float, !dbg !21
|
| 97 |
+
%98 = bitcast i32 %94 to float, !dbg !21
|
| 98 |
+
%99 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %79, i1 %18) #6, !dbg !21
|
| 99 |
+
%100 = extractvalue { i32, i32, i32, i32 } %99, 0, !dbg !21
|
| 100 |
+
%101 = extractvalue { i32, i32, i32, i32 } %99, 1, !dbg !21
|
| 101 |
+
%102 = extractvalue { i32, i32, i32, i32 } %99, 2, !dbg !21
|
| 102 |
+
%103 = extractvalue { i32, i32, i32, i32 } %99, 3, !dbg !21
|
| 103 |
+
%104 = bitcast i32 %100 to float, !dbg !21
|
| 104 |
+
%105 = bitcast i32 %101 to float, !dbg !21
|
| 105 |
+
%106 = bitcast i32 %102 to float, !dbg !21
|
| 106 |
+
%107 = bitcast i32 %103 to float, !dbg !21
|
| 107 |
+
%108 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %80, i1 %18) #6, !dbg !21
|
| 108 |
+
%109 = extractvalue { i32, i32, i32, i32 } %108, 0, !dbg !21
|
| 109 |
+
%110 = extractvalue { i32, i32, i32, i32 } %108, 1, !dbg !21
|
| 110 |
+
%111 = extractvalue { i32, i32, i32, i32 } %108, 2, !dbg !21
|
| 111 |
+
%112 = extractvalue { i32, i32, i32, i32 } %108, 3, !dbg !21
|
| 112 |
+
%113 = bitcast i32 %109 to float, !dbg !21
|
| 113 |
+
%114 = bitcast i32 %110 to float, !dbg !21
|
| 114 |
+
%115 = bitcast i32 %111 to float, !dbg !21
|
| 115 |
+
%116 = bitcast i32 %112 to float, !dbg !21
|
| 116 |
+
%117 = select i1 %18, float %46, float 0xFFF0000000000000, !dbg !22
|
| 117 |
+
%118 = select i1 %18, float %47, float 0xFFF0000000000000, !dbg !22
|
| 118 |
+
%119 = select i1 %18, float %48, float 0xFFF0000000000000, !dbg !22
|
| 119 |
+
%120 = select i1 %18, float %49, float 0xFFF0000000000000, !dbg !22
|
| 120 |
+
%121 = select i1 %18, float %55, float 0xFFF0000000000000, !dbg !22
|
| 121 |
+
%122 = select i1 %18, float %56, float 0xFFF0000000000000, !dbg !22
|
| 122 |
+
%123 = select i1 %18, float %57, float 0xFFF0000000000000, !dbg !22
|
| 123 |
+
%124 = select i1 %18, float %58, float 0xFFF0000000000000, !dbg !22
|
| 124 |
+
%125 = select i1 %18, float %64, float 0xFFF0000000000000, !dbg !22
|
| 125 |
+
%126 = select i1 %18, float %65, float 0xFFF0000000000000, !dbg !22
|
| 126 |
+
%127 = select i1 %18, float %66, float 0xFFF0000000000000, !dbg !22
|
| 127 |
+
%128 = select i1 %18, float %67, float 0xFFF0000000000000, !dbg !22
|
| 128 |
+
%129 = select i1 %18, float %73, float 0xFFF0000000000000, !dbg !22
|
| 129 |
+
%130 = select i1 %18, float %74, float 0xFFF0000000000000, !dbg !22
|
| 130 |
+
%131 = select i1 %18, float %75, float 0xFFF0000000000000, !dbg !22
|
| 131 |
+
%132 = select i1 %18, float %76, float 0xFFF0000000000000, !dbg !22
|
| 132 |
+
%133 = fcmp ogt float %117, %121, !dbg !23
|
| 133 |
+
%134 = fcmp uno float %117, 0.000000e+00, !dbg !27
|
| 134 |
+
%135 = or i1 %134, %133, !dbg !28
|
| 135 |
+
%136 = select i1 %135, float %117, float %121, !dbg !29
|
| 136 |
+
%137 = fcmp ogt float %118, %122, !dbg !23
|
| 137 |
+
%138 = fcmp uno float %118, 0.000000e+00, !dbg !27
|
| 138 |
+
%139 = or i1 %138, %137, !dbg !28
|
| 139 |
+
%140 = select i1 %139, float %118, float %122, !dbg !29
|
| 140 |
+
%141 = fcmp ogt float %119, %123, !dbg !23
|
| 141 |
+
%142 = fcmp uno float %119, 0.000000e+00, !dbg !27
|
| 142 |
+
%143 = or i1 %142, %141, !dbg !28
|
| 143 |
+
%144 = select i1 %143, float %119, float %123, !dbg !29
|
| 144 |
+
%145 = fcmp ogt float %120, %124, !dbg !23
|
| 145 |
+
%146 = fcmp uno float %120, 0.000000e+00, !dbg !27
|
| 146 |
+
%147 = or i1 %146, %145, !dbg !28
|
| 147 |
+
%148 = select i1 %147, float %120, float %124, !dbg !29
|
| 148 |
+
%149 = fcmp ogt float %136, %125, !dbg !23
|
| 149 |
+
%150 = fcmp uno float %136, 0.000000e+00, !dbg !27
|
| 150 |
+
%151 = or i1 %149, %150, !dbg !28
|
| 151 |
+
%152 = select i1 %151, float %136, float %125, !dbg !29
|
| 152 |
+
%153 = fcmp ogt float %140, %126, !dbg !23
|
| 153 |
+
%154 = fcmp uno float %140, 0.000000e+00, !dbg !27
|
| 154 |
+
%155 = or i1 %153, %154, !dbg !28
|
| 155 |
+
%156 = select i1 %155, float %140, float %126, !dbg !29
|
| 156 |
+
%157 = fcmp ogt float %144, %127, !dbg !23
|
| 157 |
+
%158 = fcmp uno float %144, 0.000000e+00, !dbg !27
|
| 158 |
+
%159 = or i1 %157, %158, !dbg !28
|
| 159 |
+
%160 = select i1 %159, float %144, float %127, !dbg !29
|
| 160 |
+
%161 = fcmp ogt float %148, %128, !dbg !23
|
| 161 |
+
%162 = fcmp uno float %148, 0.000000e+00, !dbg !27
|
| 162 |
+
%163 = or i1 %161, %162, !dbg !28
|
| 163 |
+
%164 = select i1 %163, float %148, float %128, !dbg !29
|
| 164 |
+
%165 = fcmp ogt float %152, %129, !dbg !23
|
| 165 |
+
%166 = fcmp uno float %152, 0.000000e+00, !dbg !27
|
| 166 |
+
%167 = or i1 %165, %166, !dbg !28
|
| 167 |
+
%168 = select i1 %167, float %152, float %129, !dbg !29
|
| 168 |
+
%169 = fcmp ogt float %156, %130, !dbg !23
|
| 169 |
+
%170 = fcmp uno float %156, 0.000000e+00, !dbg !27
|
| 170 |
+
%171 = or i1 %169, %170, !dbg !28
|
| 171 |
+
%172 = select i1 %171, float %156, float %130, !dbg !29
|
| 172 |
+
%173 = fcmp ogt float %160, %131, !dbg !23
|
| 173 |
+
%174 = fcmp uno float %160, 0.000000e+00, !dbg !27
|
| 174 |
+
%175 = or i1 %173, %174, !dbg !28
|
| 175 |
+
%176 = select i1 %175, float %160, float %131, !dbg !29
|
| 176 |
+
%177 = fcmp ogt float %164, %132, !dbg !23
|
| 177 |
+
%178 = fcmp uno float %164, 0.000000e+00, !dbg !27
|
| 178 |
+
%179 = or i1 %177, %178, !dbg !28
|
| 179 |
+
%180 = select i1 %179, float %164, float %132, !dbg !29
|
| 180 |
+
%.idx = shl nuw nsw i32 %15, 7, !dbg !30
|
| 181 |
+
%181 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx, !dbg !30
|
| 182 |
+
%182 = getelementptr float, ptr addrspace(3) %181, i32 %19, !dbg !30
|
| 183 |
+
%183 = bitcast float %168 to <1 x i32>, !dbg !30
|
| 184 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %182, <1 x i32> %183, i1 true) #6, !dbg !30
|
| 185 |
+
%184 = getelementptr i8, ptr addrspace(3) %181, i32 32, !dbg !30
|
| 186 |
+
%185 = getelementptr float, ptr addrspace(3) %184, i32 %19, !dbg !30
|
| 187 |
+
%186 = bitcast float %172 to <1 x i32>, !dbg !30
|
| 188 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %185, <1 x i32> %186, i1 true) #6, !dbg !30
|
| 189 |
+
%187 = getelementptr i8, ptr addrspace(3) %181, i32 64, !dbg !30
|
| 190 |
+
%188 = getelementptr float, ptr addrspace(3) %187, i32 %19, !dbg !30
|
| 191 |
+
%189 = bitcast float %176 to <1 x i32>, !dbg !30
|
| 192 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %189, i1 true) #6, !dbg !30
|
| 193 |
+
%190 = getelementptr i8, ptr addrspace(3) %181, i32 96, !dbg !30
|
| 194 |
+
%191 = getelementptr float, ptr addrspace(3) %190, i32 %19, !dbg !30
|
| 195 |
+
%192 = bitcast float %180 to <1 x i32>, !dbg !30
|
| 196 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %192, i1 true) #6, !dbg !30
|
| 197 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !30
|
| 198 |
+
%193 = getelementptr float, ptr addrspace(3) @global_smem, i32 %13, !dbg !30
|
| 199 |
+
%194 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %193, i1 true) #6, !dbg !30
|
| 200 |
+
%195 = bitcast i32 %194 to float, !dbg !30
|
| 201 |
+
%196 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %194, i32 4, i32 31), !dbg !30
|
| 202 |
+
%197 = bitcast i32 %196 to float, !dbg !30
|
| 203 |
+
%198 = fcmp ogt float %195, %197, !dbg !23
|
| 204 |
+
%199 = fcmp uno float %195, 0.000000e+00, !dbg !27
|
| 205 |
+
%200 = or i1 %199, %198, !dbg !28
|
| 206 |
+
%201 = select i1 %200, float %195, float %197, !dbg !29
|
| 207 |
+
%202 = bitcast float %201 to i32, !dbg !30
|
| 208 |
+
%203 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %202, i32 2, i32 31), !dbg !30
|
| 209 |
+
%204 = bitcast i32 %203 to float, !dbg !30
|
| 210 |
+
%205 = fcmp ogt float %201, %204, !dbg !23
|
| 211 |
+
%206 = fcmp uno float %201, 0.000000e+00, !dbg !27
|
| 212 |
+
%207 = or i1 %205, %206, !dbg !28
|
| 213 |
+
%208 = select i1 %207, float %201, float %204, !dbg !29
|
| 214 |
+
%209 = bitcast float %208 to i32, !dbg !30
|
| 215 |
+
%210 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %209, i32 1, i32 31), !dbg !30
|
| 216 |
+
%211 = bitcast i32 %210 to float, !dbg !30
|
| 217 |
+
%212 = fcmp ogt float %208, %211, !dbg !23
|
| 218 |
+
%213 = fcmp uno float %208, 0.000000e+00, !dbg !27
|
| 219 |
+
%214 = or i1 %212, %213, !dbg !28
|
| 220 |
+
%215 = and i32 %13, 7, !dbg !30
|
| 221 |
+
%216 = icmp eq i32 %215, 0, !dbg !30
|
| 222 |
+
%217 = select i1 %214, i32 %209, i32 %210, !dbg !29
|
| 223 |
+
%218 = insertelement <1 x i32> poison, i32 %217, i64 0, !dbg !30
|
| 224 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %193, <1 x i32> %218, i1 %216) #6, !dbg !30
|
| 225 |
+
%219 = getelementptr i8, ptr addrspace(3) %193, i32 1024, !dbg !30
|
| 226 |
+
%220 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %219, i1 true) #6, !dbg !30
|
| 227 |
+
%221 = bitcast i32 %220 to float, !dbg !30
|
| 228 |
+
%222 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %220, i32 4, i32 31), !dbg !30
|
| 229 |
+
%223 = bitcast i32 %222 to float, !dbg !30
|
| 230 |
+
%224 = fcmp ogt float %221, %223, !dbg !23
|
| 231 |
+
%225 = fcmp uno float %221, 0.000000e+00, !dbg !27
|
| 232 |
+
%226 = or i1 %225, %224, !dbg !28
|
| 233 |
+
%227 = select i1 %226, float %221, float %223, !dbg !29
|
| 234 |
+
%228 = bitcast float %227 to i32, !dbg !30
|
| 235 |
+
%229 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %228, i32 2, i32 31), !dbg !30
|
| 236 |
+
%230 = bitcast i32 %229 to float, !dbg !30
|
| 237 |
+
%231 = fcmp ogt float %227, %230, !dbg !23
|
| 238 |
+
%232 = fcmp uno float %227, 0.000000e+00, !dbg !27
|
| 239 |
+
%233 = or i1 %231, %232, !dbg !28
|
| 240 |
+
%234 = select i1 %233, float %227, float %230, !dbg !29
|
| 241 |
+
%235 = bitcast float %234 to i32, !dbg !30
|
| 242 |
+
%236 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %235, i32 1, i32 31), !dbg !30
|
| 243 |
+
%237 = bitcast i32 %236 to float, !dbg !30
|
| 244 |
+
%238 = fcmp ogt float %234, %237, !dbg !23
|
| 245 |
+
%239 = fcmp uno float %234, 0.000000e+00, !dbg !27
|
| 246 |
+
%240 = or i1 %238, %239, !dbg !28
|
| 247 |
+
%241 = select i1 %240, i32 %235, i32 %236, !dbg !29
|
| 248 |
+
%242 = insertelement <1 x i32> poison, i32 %241, i64 0, !dbg !30
|
| 249 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %219, <1 x i32> %242, i1 %216) #6, !dbg !30
|
| 250 |
+
%243 = getelementptr i8, ptr addrspace(3) %193, i32 2048, !dbg !30
|
| 251 |
+
%244 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %243, i1 true) #6, !dbg !30
|
| 252 |
+
%245 = bitcast i32 %244 to float, !dbg !30
|
| 253 |
+
%246 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %244, i32 4, i32 31), !dbg !30
|
| 254 |
+
%247 = bitcast i32 %246 to float, !dbg !30
|
| 255 |
+
%248 = fcmp ogt float %245, %247, !dbg !23
|
| 256 |
+
%249 = fcmp uno float %245, 0.000000e+00, !dbg !27
|
| 257 |
+
%250 = or i1 %249, %248, !dbg !28
|
| 258 |
+
%251 = select i1 %250, float %245, float %247, !dbg !29
|
| 259 |
+
%252 = bitcast float %251 to i32, !dbg !30
|
| 260 |
+
%253 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %252, i32 2, i32 31), !dbg !30
|
| 261 |
+
%254 = bitcast i32 %253 to float, !dbg !30
|
| 262 |
+
%255 = fcmp ogt float %251, %254, !dbg !23
|
| 263 |
+
%256 = fcmp uno float %251, 0.000000e+00, !dbg !27
|
| 264 |
+
%257 = or i1 %255, %256, !dbg !28
|
| 265 |
+
%258 = select i1 %257, float %251, float %254, !dbg !29
|
| 266 |
+
%259 = bitcast float %258 to i32, !dbg !30
|
| 267 |
+
%260 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %259, i32 1, i32 31), !dbg !30
|
| 268 |
+
%261 = bitcast i32 %260 to float, !dbg !30
|
| 269 |
+
%262 = fcmp ogt float %258, %261, !dbg !23
|
| 270 |
+
%263 = fcmp uno float %258, 0.000000e+00, !dbg !27
|
| 271 |
+
%264 = or i1 %262, %263, !dbg !28
|
| 272 |
+
%265 = select i1 %264, i32 %259, i32 %260, !dbg !29
|
| 273 |
+
%266 = insertelement <1 x i32> poison, i32 %265, i64 0, !dbg !30
|
| 274 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %243, <1 x i32> %266, i1 %216) #6, !dbg !30
|
| 275 |
+
%267 = getelementptr i8, ptr addrspace(3) %193, i32 3072, !dbg !30
|
| 276 |
+
%268 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %267, i1 true) #6, !dbg !30
|
| 277 |
+
%269 = bitcast i32 %268 to float, !dbg !30
|
| 278 |
+
%270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %268, i32 4, i32 31), !dbg !30
|
| 279 |
+
%271 = bitcast i32 %270 to float, !dbg !30
|
| 280 |
+
%272 = fcmp ogt float %269, %271, !dbg !23
|
| 281 |
+
%273 = fcmp uno float %269, 0.000000e+00, !dbg !27
|
| 282 |
+
%274 = or i1 %273, %272, !dbg !28
|
| 283 |
+
%275 = select i1 %274, float %269, float %271, !dbg !29
|
| 284 |
+
%276 = bitcast float %275 to i32, !dbg !30
|
| 285 |
+
%277 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %276, i32 2, i32 31), !dbg !30
|
| 286 |
+
%278 = bitcast i32 %277 to float, !dbg !30
|
| 287 |
+
%279 = fcmp ogt float %275, %278, !dbg !23
|
| 288 |
+
%280 = fcmp uno float %275, 0.000000e+00, !dbg !27
|
| 289 |
+
%281 = or i1 %279, %280, !dbg !28
|
| 290 |
+
%282 = select i1 %281, float %275, float %278, !dbg !29
|
| 291 |
+
%283 = bitcast float %282 to i32, !dbg !30
|
| 292 |
+
%284 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %283, i32 1, i32 31), !dbg !30
|
| 293 |
+
%285 = bitcast i32 %284 to float, !dbg !30
|
| 294 |
+
%286 = fcmp ogt float %282, %285, !dbg !23
|
| 295 |
+
%287 = fcmp uno float %282, 0.000000e+00, !dbg !27
|
| 296 |
+
%288 = or i1 %286, %287, !dbg !28
|
| 297 |
+
%289 = select i1 %288, i32 %283, i32 %284, !dbg !29
|
| 298 |
+
%290 = insertelement <1 x i32> poison, i32 %289, i64 0, !dbg !30
|
| 299 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %267, <1 x i32> %290, i1 %216) #6, !dbg !30
|
| 300 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !30
|
| 301 |
+
%291 = load float, ptr addrspace(3) %181, align 16, !dbg !30
|
| 302 |
+
%292 = load float, ptr addrspace(3) %184, align 16, !dbg !30
|
| 303 |
+
%293 = load float, ptr addrspace(3) %187, align 16, !dbg !30
|
| 304 |
+
%294 = load float, ptr addrspace(3) %190, align 16, !dbg !30
|
| 305 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !31
|
| 306 |
+
%295 = shl nuw nsw i32 %15, 4, !dbg !31
|
| 307 |
+
%296 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %295, !dbg !31
|
| 308 |
+
%297 = insertelement <4 x float> poison, float %291, i64 0, !dbg !31
|
| 309 |
+
%298 = insertelement <4 x float> %297, float %292, i64 1, !dbg !31
|
| 310 |
+
%299 = insertelement <4 x float> %298, float %293, i64 2, !dbg !31
|
| 311 |
+
%300 = insertelement <4 x float> %299, float %294, i64 3, !dbg !31
|
| 312 |
+
store <4 x float> %300, ptr addrspace(3) %296, align 16, !dbg !31
|
| 313 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !31
|
| 314 |
+
%301 = shl nuw nsw i32 %14, 7, !dbg !31
|
| 315 |
+
%302 = shl nuw nsw i32 %13, 4, !dbg !31
|
| 316 |
+
%303 = or i32 %301, %302, !dbg !31
|
| 317 |
+
%304 = and i32 %303, 496, !dbg !31
|
| 318 |
+
%305 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %304, !dbg !31
|
| 319 |
+
%306 = ptrtoint ptr addrspace(3) %305 to i32, !dbg !31
|
| 320 |
+
%307 = tail call i32 asm sideeffect "ldmatrix.sync.aligned.m8n8.x1.shared.b16 {$0}, [$1];", "=r,r"(i32 %306) #6, !dbg !31
|
| 321 |
+
%308 = fcmp oeq float %291, 0xFFF0000000000000, !dbg !32
|
| 322 |
+
%309 = fcmp oeq float %292, 0xFFF0000000000000, !dbg !32
|
| 323 |
+
%310 = fcmp oeq float %293, 0xFFF0000000000000, !dbg !32
|
| 324 |
+
%311 = fcmp oeq float %294, 0xFFF0000000000000, !dbg !32
|
| 325 |
+
%312 = fsub float %46, %291, !dbg !33
|
| 326 |
+
%313 = fsub float %47, %292, !dbg !33
|
| 327 |
+
%314 = fsub float %48, %293, !dbg !33
|
| 328 |
+
%315 = fsub float %49, %294, !dbg !33
|
| 329 |
+
%316 = fsub float %55, %291, !dbg !33
|
| 330 |
+
%317 = fsub float %56, %292, !dbg !33
|
| 331 |
+
%318 = fsub float %57, %293, !dbg !33
|
| 332 |
+
%319 = fsub float %58, %294, !dbg !33
|
| 333 |
+
%320 = fsub float %64, %291, !dbg !33
|
| 334 |
+
%321 = fsub float %65, %292, !dbg !33
|
| 335 |
+
%322 = fsub float %66, %293, !dbg !33
|
| 336 |
+
%323 = fsub float %67, %294, !dbg !33
|
| 337 |
+
%324 = fsub float %73, %291, !dbg !33
|
| 338 |
+
%325 = fsub float %74, %292, !dbg !33
|
| 339 |
+
%326 = fsub float %75, %293, !dbg !33
|
| 340 |
+
%327 = fsub float %76, %294, !dbg !33
|
| 341 |
+
%328 = select i1 %308, float 0.000000e+00, float %312, !dbg !34
|
| 342 |
+
%329 = select i1 %309, float 0.000000e+00, float %313, !dbg !34
|
| 343 |
+
%330 = select i1 %310, float 0.000000e+00, float %314, !dbg !34
|
| 344 |
+
%331 = select i1 %311, float 0.000000e+00, float %315, !dbg !34
|
| 345 |
+
%332 = select i1 %308, float 0.000000e+00, float %316, !dbg !34
|
| 346 |
+
%333 = select i1 %309, float 0.000000e+00, float %317, !dbg !34
|
| 347 |
+
%334 = select i1 %310, float 0.000000e+00, float %318, !dbg !34
|
| 348 |
+
%335 = select i1 %311, float 0.000000e+00, float %319, !dbg !34
|
| 349 |
+
%336 = select i1 %308, float 0.000000e+00, float %320, !dbg !34
|
| 350 |
+
%337 = select i1 %309, float 0.000000e+00, float %321, !dbg !34
|
| 351 |
+
%338 = select i1 %310, float 0.000000e+00, float %322, !dbg !34
|
| 352 |
+
%339 = select i1 %311, float 0.000000e+00, float %323, !dbg !34
|
| 353 |
+
%340 = select i1 %308, float 0.000000e+00, float %324, !dbg !34
|
| 354 |
+
%341 = select i1 %309, float 0.000000e+00, float %325, !dbg !34
|
| 355 |
+
%342 = select i1 %310, float 0.000000e+00, float %326, !dbg !34
|
| 356 |
+
%343 = select i1 %311, float 0.000000e+00, float %327, !dbg !34
|
| 357 |
+
%344 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
|
| 358 |
+
%.not.i = icmp eq i32 %344, 0, !dbg !35
|
| 359 |
+
br i1 %.not.i, label %347, label %345, !dbg !35
|
| 360 |
+
|
| 361 |
+
345: ; preds = %10
|
| 362 |
+
%346 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %328) #6, !dbg !35
|
| 363 |
+
br label %__nv_exp2f.exit, !dbg !35
|
| 364 |
+
|
| 365 |
+
347: ; preds = %10
|
| 366 |
+
%348 = tail call float @llvm.nvvm.ex2.approx.f(float %328) #6, !dbg !35
|
| 367 |
+
br label %__nv_exp2f.exit, !dbg !35
|
| 368 |
+
|
| 369 |
+
__nv_exp2f.exit: ; preds = %345, %347
|
| 370 |
+
%.0.i = phi float [ %346, %345 ], [ %348, %347 ], !dbg !35
|
| 371 |
+
%349 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
|
| 372 |
+
%.not.i1 = icmp eq i32 %349, 0, !dbg !35
|
| 373 |
+
br i1 %.not.i1, label %352, label %350, !dbg !35
|
| 374 |
+
|
| 375 |
+
350: ; preds = %__nv_exp2f.exit
|
| 376 |
+
%351 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %329) #6, !dbg !35
|
| 377 |
+
br label %__nv_exp2f.exit3, !dbg !35
|
| 378 |
+
|
| 379 |
+
352: ; preds = %__nv_exp2f.exit
|
| 380 |
+
%353 = tail call float @llvm.nvvm.ex2.approx.f(float %329) #6, !dbg !35
|
| 381 |
+
br label %__nv_exp2f.exit3, !dbg !35
|
| 382 |
+
|
| 383 |
+
__nv_exp2f.exit3: ; preds = %350, %352
|
| 384 |
+
%.0.i2 = phi float [ %351, %350 ], [ %353, %352 ], !dbg !35
|
| 385 |
+
%354 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
|
| 386 |
+
%.not.i4 = icmp eq i32 %354, 0, !dbg !35
|
| 387 |
+
br i1 %.not.i4, label %357, label %355, !dbg !35
|
| 388 |
+
|
| 389 |
+
355: ; preds = %__nv_exp2f.exit3
|
| 390 |
+
%356 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %330) #6, !dbg !35
|
| 391 |
+
br label %__nv_exp2f.exit6, !dbg !35
|
| 392 |
+
|
| 393 |
+
357: ; preds = %__nv_exp2f.exit3
|
| 394 |
+
%358 = tail call float @llvm.nvvm.ex2.approx.f(float %330) #6, !dbg !35
|
| 395 |
+
br label %__nv_exp2f.exit6, !dbg !35
|
| 396 |
+
|
| 397 |
+
__nv_exp2f.exit6: ; preds = %355, %357
|
| 398 |
+
%.0.i5 = phi float [ %356, %355 ], [ %358, %357 ], !dbg !35
|
| 399 |
+
%359 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
|
| 400 |
+
%.not.i7 = icmp eq i32 %359, 0, !dbg !35
|
| 401 |
+
br i1 %.not.i7, label %362, label %360, !dbg !35
|
| 402 |
+
|
| 403 |
+
360: ; preds = %__nv_exp2f.exit6
|
| 404 |
+
%361 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %331) #6, !dbg !35
|
| 405 |
+
br label %__nv_exp2f.exit9, !dbg !35
|
| 406 |
+
|
| 407 |
+
362: ; preds = %__nv_exp2f.exit6
|
| 408 |
+
%363 = tail call float @llvm.nvvm.ex2.approx.f(float %331) #6, !dbg !35
|
| 409 |
+
br label %__nv_exp2f.exit9, !dbg !35
|
| 410 |
+
|
| 411 |
+
__nv_exp2f.exit9: ; preds = %360, %362
|
| 412 |
+
%.0.i8 = phi float [ %361, %360 ], [ %363, %362 ], !dbg !35
|
| 413 |
+
%364 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
|
| 414 |
+
%.not.i10 = icmp eq i32 %364, 0, !dbg !35
|
| 415 |
+
br i1 %.not.i10, label %367, label %365, !dbg !35
|
| 416 |
+
|
| 417 |
+
365: ; preds = %__nv_exp2f.exit9
|
| 418 |
+
%366 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %332) #6, !dbg !35
|
| 419 |
+
br label %__nv_exp2f.exit12, !dbg !35
|
| 420 |
+
|
| 421 |
+
367: ; preds = %__nv_exp2f.exit9
|
| 422 |
+
%368 = tail call float @llvm.nvvm.ex2.approx.f(float %332) #6, !dbg !35
|
| 423 |
+
br label %__nv_exp2f.exit12, !dbg !35
|
| 424 |
+
|
| 425 |
+
__nv_exp2f.exit12: ; preds = %365, %367
|
| 426 |
+
%.0.i11 = phi float [ %366, %365 ], [ %368, %367 ], !dbg !35
|
| 427 |
+
%369 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
|
| 428 |
+
%.not.i13 = icmp eq i32 %369, 0, !dbg !35
|
| 429 |
+
br i1 %.not.i13, label %372, label %370, !dbg !35
|
| 430 |
+
|
| 431 |
+
370: ; preds = %__nv_exp2f.exit12
|
| 432 |
+
%371 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %333) #6, !dbg !35
|
| 433 |
+
br label %__nv_exp2f.exit15, !dbg !35
|
| 434 |
+
|
| 435 |
+
372: ; preds = %__nv_exp2f.exit12
|
| 436 |
+
%373 = tail call float @llvm.nvvm.ex2.approx.f(float %333) #6, !dbg !35
|
| 437 |
+
br label %__nv_exp2f.exit15, !dbg !35
|
| 438 |
+
|
| 439 |
+
__nv_exp2f.exit15: ; preds = %370, %372
|
| 440 |
+
%.0.i14 = phi float [ %371, %370 ], [ %373, %372 ], !dbg !35
|
| 441 |
+
%374 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
|
| 442 |
+
%.not.i16 = icmp eq i32 %374, 0, !dbg !35
|
| 443 |
+
br i1 %.not.i16, label %377, label %375, !dbg !35
|
| 444 |
+
|
| 445 |
+
375: ; preds = %__nv_exp2f.exit15
|
| 446 |
+
%376 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %334) #6, !dbg !35
|
| 447 |
+
br label %__nv_exp2f.exit18, !dbg !35
|
| 448 |
+
|
| 449 |
+
377: ; preds = %__nv_exp2f.exit15
|
| 450 |
+
%378 = tail call float @llvm.nvvm.ex2.approx.f(float %334) #6, !dbg !35
|
| 451 |
+
br label %__nv_exp2f.exit18, !dbg !35
|
| 452 |
+
|
| 453 |
+
__nv_exp2f.exit18: ; preds = %375, %377
|
| 454 |
+
%.0.i17 = phi float [ %376, %375 ], [ %378, %377 ], !dbg !35
|
| 455 |
+
%379 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
|
| 456 |
+
%.not.i19 = icmp eq i32 %379, 0, !dbg !35
|
| 457 |
+
br i1 %.not.i19, label %382, label %380, !dbg !35
|
| 458 |
+
|
| 459 |
+
380: ; preds = %__nv_exp2f.exit18
|
| 460 |
+
%381 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %335) #6, !dbg !35
|
| 461 |
+
br label %__nv_exp2f.exit21, !dbg !35
|
| 462 |
+
|
| 463 |
+
382: ; preds = %__nv_exp2f.exit18
|
| 464 |
+
%383 = tail call float @llvm.nvvm.ex2.approx.f(float %335) #6, !dbg !35
|
| 465 |
+
br label %__nv_exp2f.exit21, !dbg !35
|
| 466 |
+
|
| 467 |
+
__nv_exp2f.exit21: ; preds = %380, %382
|
| 468 |
+
%.0.i20 = phi float [ %381, %380 ], [ %383, %382 ], !dbg !35
|
| 469 |
+
%384 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
|
| 470 |
+
%.not.i22 = icmp eq i32 %384, 0, !dbg !35
|
| 471 |
+
br i1 %.not.i22, label %387, label %385, !dbg !35
|
| 472 |
+
|
| 473 |
+
385: ; preds = %__nv_exp2f.exit21
|
| 474 |
+
%386 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %336) #6, !dbg !35
|
| 475 |
+
br label %__nv_exp2f.exit24, !dbg !35
|
| 476 |
+
|
| 477 |
+
387: ; preds = %__nv_exp2f.exit21
|
| 478 |
+
%388 = tail call float @llvm.nvvm.ex2.approx.f(float %336) #6, !dbg !35
|
| 479 |
+
br label %__nv_exp2f.exit24, !dbg !35
|
| 480 |
+
|
| 481 |
+
__nv_exp2f.exit24: ; preds = %385, %387
|
| 482 |
+
%.0.i23 = phi float [ %386, %385 ], [ %388, %387 ], !dbg !35
|
| 483 |
+
%389 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
|
| 484 |
+
%.not.i25 = icmp eq i32 %389, 0, !dbg !35
|
| 485 |
+
br i1 %.not.i25, label %392, label %390, !dbg !35
|
| 486 |
+
|
| 487 |
+
390: ; preds = %__nv_exp2f.exit24
|
| 488 |
+
%391 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %337) #6, !dbg !35
|
| 489 |
+
br label %__nv_exp2f.exit27, !dbg !35
|
| 490 |
+
|
| 491 |
+
392: ; preds = %__nv_exp2f.exit24
|
| 492 |
+
%393 = tail call float @llvm.nvvm.ex2.approx.f(float %337) #6, !dbg !35
|
| 493 |
+
br label %__nv_exp2f.exit27, !dbg !35
|
| 494 |
+
|
| 495 |
+
__nv_exp2f.exit27: ; preds = %390, %392
|
| 496 |
+
%.0.i26 = phi float [ %391, %390 ], [ %393, %392 ], !dbg !35
|
| 497 |
+
%394 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
|
| 498 |
+
%.not.i28 = icmp eq i32 %394, 0, !dbg !35
|
| 499 |
+
br i1 %.not.i28, label %397, label %395, !dbg !35
|
| 500 |
+
|
| 501 |
+
395: ; preds = %__nv_exp2f.exit27
|
| 502 |
+
%396 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %338) #6, !dbg !35
|
| 503 |
+
br label %__nv_exp2f.exit30, !dbg !35
|
| 504 |
+
|
| 505 |
+
397: ; preds = %__nv_exp2f.exit27
|
| 506 |
+
%398 = tail call float @llvm.nvvm.ex2.approx.f(float %338) #6, !dbg !35
|
| 507 |
+
br label %__nv_exp2f.exit30, !dbg !35
|
| 508 |
+
|
| 509 |
+
__nv_exp2f.exit30: ; preds = %395, %397
|
| 510 |
+
%.0.i29 = phi float [ %396, %395 ], [ %398, %397 ], !dbg !35
|
| 511 |
+
%399 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
|
| 512 |
+
%.not.i31 = icmp eq i32 %399, 0, !dbg !35
|
| 513 |
+
br i1 %.not.i31, label %402, label %400, !dbg !35
|
| 514 |
+
|
| 515 |
+
400: ; preds = %__nv_exp2f.exit30
|
| 516 |
+
%401 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %339) #6, !dbg !35
|
| 517 |
+
br label %__nv_exp2f.exit33, !dbg !35
|
| 518 |
+
|
| 519 |
+
402: ; preds = %__nv_exp2f.exit30
|
| 520 |
+
%403 = tail call float @llvm.nvvm.ex2.approx.f(float %339) #6, !dbg !35
|
| 521 |
+
br label %__nv_exp2f.exit33, !dbg !35
|
| 522 |
+
|
| 523 |
+
__nv_exp2f.exit33: ; preds = %400, %402
|
| 524 |
+
%.0.i32 = phi float [ %401, %400 ], [ %403, %402 ], !dbg !35
|
| 525 |
+
%404 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
|
| 526 |
+
%.not.i34 = icmp eq i32 %404, 0, !dbg !35
|
| 527 |
+
br i1 %.not.i34, label %407, label %405, !dbg !35
|
| 528 |
+
|
| 529 |
+
405: ; preds = %__nv_exp2f.exit33
|
| 530 |
+
%406 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %340) #6, !dbg !35
|
| 531 |
+
br label %__nv_exp2f.exit36, !dbg !35
|
| 532 |
+
|
| 533 |
+
407: ; preds = %__nv_exp2f.exit33
|
| 534 |
+
%408 = tail call float @llvm.nvvm.ex2.approx.f(float %340) #6, !dbg !35
|
| 535 |
+
br label %__nv_exp2f.exit36, !dbg !35
|
| 536 |
+
|
| 537 |
+
__nv_exp2f.exit36: ; preds = %405, %407
|
| 538 |
+
%.0.i35 = phi float [ %406, %405 ], [ %408, %407 ], !dbg !35
|
| 539 |
+
%409 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
|
| 540 |
+
%.not.i37 = icmp eq i32 %409, 0, !dbg !35
|
| 541 |
+
br i1 %.not.i37, label %412, label %410, !dbg !35
|
| 542 |
+
|
| 543 |
+
410: ; preds = %__nv_exp2f.exit36
|
| 544 |
+
%411 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %341) #6, !dbg !35
|
| 545 |
+
br label %__nv_exp2f.exit39, !dbg !35
|
| 546 |
+
|
| 547 |
+
412: ; preds = %__nv_exp2f.exit36
|
| 548 |
+
%413 = tail call float @llvm.nvvm.ex2.approx.f(float %341) #6, !dbg !35
|
| 549 |
+
br label %__nv_exp2f.exit39, !dbg !35
|
| 550 |
+
|
| 551 |
+
__nv_exp2f.exit39: ; preds = %410, %412
|
| 552 |
+
%.0.i38 = phi float [ %411, %410 ], [ %413, %412 ], !dbg !35
|
| 553 |
+
%414 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
|
| 554 |
+
%.not.i40 = icmp eq i32 %414, 0, !dbg !35
|
| 555 |
+
br i1 %.not.i40, label %417, label %415, !dbg !35
|
| 556 |
+
|
| 557 |
+
415: ; preds = %__nv_exp2f.exit39
|
| 558 |
+
%416 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %342) #6, !dbg !35
|
| 559 |
+
br label %__nv_exp2f.exit42, !dbg !35
|
| 560 |
+
|
| 561 |
+
417: ; preds = %__nv_exp2f.exit39
|
| 562 |
+
%418 = tail call float @llvm.nvvm.ex2.approx.f(float %342) #6, !dbg !35
|
| 563 |
+
br label %__nv_exp2f.exit42, !dbg !35
|
| 564 |
+
|
| 565 |
+
__nv_exp2f.exit42: ; preds = %415, %417
|
| 566 |
+
%.0.i41 = phi float [ %416, %415 ], [ %418, %417 ], !dbg !35
|
| 567 |
+
%419 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
|
| 568 |
+
%.not.i43 = icmp eq i32 %419, 0, !dbg !35
|
| 569 |
+
br i1 %.not.i43, label %422, label %420, !dbg !35
|
| 570 |
+
|
| 571 |
+
420: ; preds = %__nv_exp2f.exit42
|
| 572 |
+
%421 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %343) #6, !dbg !35
|
| 573 |
+
br label %__nv_exp2f.exit45, !dbg !35
|
| 574 |
+
|
| 575 |
+
422: ; preds = %__nv_exp2f.exit42
|
| 576 |
+
%423 = tail call float @llvm.nvvm.ex2.approx.f(float %343) #6, !dbg !35
|
| 577 |
+
br label %__nv_exp2f.exit45, !dbg !35
|
| 578 |
+
|
| 579 |
+
__nv_exp2f.exit45: ; preds = %420, %422
|
| 580 |
+
%.0.i44 = phi float [ %421, %420 ], [ %423, %422 ], !dbg !35
|
| 581 |
+
%424 = fmul float %.0.i, %86, !dbg !36
|
| 582 |
+
%425 = fmul float %.0.i2, %87, !dbg !36
|
| 583 |
+
%426 = fmul float %.0.i5, %88, !dbg !36
|
| 584 |
+
%427 = fmul float %.0.i8, %89, !dbg !36
|
| 585 |
+
%428 = fmul float %.0.i11, %95, !dbg !36
|
| 586 |
+
%429 = fmul float %.0.i14, %96, !dbg !36
|
| 587 |
+
%430 = fmul float %.0.i17, %97, !dbg !36
|
| 588 |
+
%431 = fmul float %.0.i20, %98, !dbg !36
|
| 589 |
+
%432 = fmul float %.0.i23, %104, !dbg !36
|
| 590 |
+
%433 = fmul float %.0.i26, %105, !dbg !36
|
| 591 |
+
%434 = fmul float %.0.i29, %106, !dbg !36
|
| 592 |
+
%435 = fmul float %.0.i32, %107, !dbg !36
|
| 593 |
+
%436 = fmul float %.0.i35, %113, !dbg !36
|
| 594 |
+
%437 = fmul float %.0.i38, %114, !dbg !36
|
| 595 |
+
%438 = fmul float %.0.i41, %115, !dbg !36
|
| 596 |
+
%439 = fmul float %.0.i44, %116, !dbg !36
|
| 597 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37
|
| 598 |
+
%440 = fadd float %424, %428, !dbg !41
|
| 599 |
+
%441 = fadd float %425, %429, !dbg !41
|
| 600 |
+
%442 = fadd float %426, %430, !dbg !41
|
| 601 |
+
%443 = fadd float %427, %431, !dbg !41
|
| 602 |
+
%444 = fadd float %440, %432, !dbg !41
|
| 603 |
+
%445 = fadd float %441, %433, !dbg !41
|
| 604 |
+
%446 = fadd float %442, %434, !dbg !41
|
| 605 |
+
%447 = fadd float %443, %435, !dbg !41
|
| 606 |
+
%448 = fadd float %444, %436, !dbg !41
|
| 607 |
+
%449 = fadd float %445, %437, !dbg !41
|
| 608 |
+
%450 = fadd float %446, %438, !dbg !41
|
| 609 |
+
%451 = fadd float %447, %439, !dbg !41
|
| 610 |
+
%452 = bitcast float %448 to i32, !dbg !37
|
| 611 |
+
%453 = select i1 %18, i32 %452, i32 0, !dbg !41
|
| 612 |
+
%454 = insertelement <1 x i32> poison, i32 %453, i64 0, !dbg !37
|
| 613 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %182, <1 x i32> %454, i1 true) #6, !dbg !37
|
| 614 |
+
%455 = bitcast float %449 to i32, !dbg !37
|
| 615 |
+
%456 = select i1 %18, i32 %455, i32 0, !dbg !41
|
| 616 |
+
%457 = insertelement <1 x i32> poison, i32 %456, i64 0, !dbg !37
|
| 617 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %185, <1 x i32> %457, i1 true) #6, !dbg !37
|
| 618 |
+
%458 = bitcast float %450 to i32, !dbg !37
|
| 619 |
+
%459 = select i1 %18, i32 %458, i32 0, !dbg !41
|
| 620 |
+
%460 = insertelement <1 x i32> poison, i32 %459, i64 0, !dbg !37
|
| 621 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %460, i1 true) #6, !dbg !37
|
| 622 |
+
%461 = bitcast float %451 to i32, !dbg !37
|
| 623 |
+
%462 = select i1 %18, i32 %461, i32 0, !dbg !41
|
| 624 |
+
%463 = insertelement <1 x i32> poison, i32 %462, i64 0, !dbg !37
|
| 625 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %463, i1 true) #6, !dbg !37
|
| 626 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37
|
| 627 |
+
%464 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %193, i1 true) #6, !dbg !37
|
| 628 |
+
%465 = bitcast i32 %464 to float, !dbg !37
|
| 629 |
+
%466 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %464, i32 4, i32 31), !dbg !37
|
| 630 |
+
%467 = bitcast i32 %466 to float, !dbg !37
|
| 631 |
+
%468 = fadd float %465, %467, !dbg !41
|
| 632 |
+
%469 = bitcast float %468 to i32, !dbg !37
|
| 633 |
+
%470 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %469, i32 2, i32 31), !dbg !37
|
| 634 |
+
%471 = bitcast i32 %470 to float, !dbg !37
|
| 635 |
+
%472 = fadd float %468, %471, !dbg !41
|
| 636 |
+
%473 = bitcast float %472 to i32, !dbg !37
|
| 637 |
+
%474 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %473, i32 1, i32 31), !dbg !37
|
| 638 |
+
%475 = bitcast i32 %474 to float, !dbg !37
|
| 639 |
+
%476 = fadd float %472, %475, !dbg !41
|
| 640 |
+
%477 = bitcast float %476 to <1 x i32>, !dbg !37
|
| 641 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %193, <1 x i32> %477, i1 %216) #6, !dbg !37
|
| 642 |
+
%478 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %219, i1 true) #6, !dbg !37
|
| 643 |
+
%479 = bitcast i32 %478 to float, !dbg !37
|
| 644 |
+
%480 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %478, i32 4, i32 31), !dbg !37
|
| 645 |
+
%481 = bitcast i32 %480 to float, !dbg !37
|
| 646 |
+
%482 = fadd float %479, %481, !dbg !41
|
| 647 |
+
%483 = bitcast float %482 to i32, !dbg !37
|
| 648 |
+
%484 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %483, i32 2, i32 31), !dbg !37
|
| 649 |
+
%485 = bitcast i32 %484 to float, !dbg !37
|
| 650 |
+
%486 = fadd float %482, %485, !dbg !41
|
| 651 |
+
%487 = bitcast float %486 to i32, !dbg !37
|
| 652 |
+
%488 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %487, i32 1, i32 31), !dbg !37
|
| 653 |
+
%489 = bitcast i32 %488 to float, !dbg !37
|
| 654 |
+
%490 = fadd float %486, %489, !dbg !41
|
| 655 |
+
%491 = bitcast float %490 to <1 x i32>, !dbg !37
|
| 656 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %219, <1 x i32> %491, i1 %216) #6, !dbg !37
|
| 657 |
+
%492 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %243, i1 true) #6, !dbg !37
|
| 658 |
+
%493 = bitcast i32 %492 to float, !dbg !37
|
| 659 |
+
%494 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %492, i32 4, i32 31), !dbg !37
|
| 660 |
+
%495 = bitcast i32 %494 to float, !dbg !37
|
| 661 |
+
%496 = fadd float %493, %495, !dbg !41
|
| 662 |
+
%497 = bitcast float %496 to i32, !dbg !37
|
| 663 |
+
%498 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %497, i32 2, i32 31), !dbg !37
|
| 664 |
+
%499 = bitcast i32 %498 to float, !dbg !37
|
| 665 |
+
%500 = fadd float %496, %499, !dbg !41
|
| 666 |
+
%501 = bitcast float %500 to i32, !dbg !37
|
| 667 |
+
%502 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %501, i32 1, i32 31), !dbg !37
|
| 668 |
+
%503 = bitcast i32 %502 to float, !dbg !37
|
| 669 |
+
%504 = fadd float %500, %503, !dbg !41
|
| 670 |
+
%505 = bitcast float %504 to <1 x i32>, !dbg !37
|
| 671 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %243, <1 x i32> %505, i1 %216) #6, !dbg !37
|
| 672 |
+
%506 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %267, i1 true) #6, !dbg !37
|
| 673 |
+
%507 = bitcast i32 %506 to float, !dbg !37
|
| 674 |
+
%508 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %506, i32 4, i32 31), !dbg !37
|
| 675 |
+
%509 = bitcast i32 %508 to float, !dbg !37
|
| 676 |
+
%510 = fadd float %507, %509, !dbg !41
|
| 677 |
+
%511 = bitcast float %510 to i32, !dbg !37
|
| 678 |
+
%512 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %511, i32 2, i32 31), !dbg !37
|
| 679 |
+
%513 = bitcast i32 %512 to float, !dbg !37
|
| 680 |
+
%514 = fadd float %510, %513, !dbg !41
|
| 681 |
+
%515 = bitcast float %514 to i32, !dbg !37
|
| 682 |
+
%516 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %515, i32 1, i32 31), !dbg !37
|
| 683 |
+
%517 = bitcast i32 %516 to float, !dbg !37
|
| 684 |
+
%518 = fadd float %514, %517, !dbg !41
|
| 685 |
+
%519 = bitcast float %518 to <1 x i32>, !dbg !37
|
| 686 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %267, <1 x i32> %519, i1 %216) #6, !dbg !37
|
| 687 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37
|
| 688 |
+
%520 = load float, ptr addrspace(3) %181, align 16, !dbg !37
|
| 689 |
+
%521 = load float, ptr addrspace(3) %184, align 16, !dbg !37
|
| 690 |
+
%522 = load float, ptr addrspace(3) %187, align 16, !dbg !37
|
| 691 |
+
%523 = load float, ptr addrspace(3) %190, align 16, !dbg !37
|
| 692 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !42
|
| 693 |
+
%524 = insertelement <4 x float> poison, float %520, i64 0, !dbg !42
|
| 694 |
+
%525 = insertelement <4 x float> %524, float %521, i64 1, !dbg !42
|
| 695 |
+
%526 = insertelement <4 x float> %525, float %522, i64 2, !dbg !42
|
| 696 |
+
%527 = insertelement <4 x float> %526, float %523, i64 3, !dbg !42
|
| 697 |
+
store <4 x float> %527, ptr addrspace(3) %296, align 16, !dbg !42
|
| 698 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !42
|
| 699 |
+
%528 = tail call i32 asm sideeffect "ldmatrix.sync.aligned.m8n8.x1.shared.b16 {$0}, [$1];", "=r,r"(i32 %306) #6, !dbg !42
|
| 700 |
+
%529 = select i1 %308, float 1.000000e+00, float %520, !dbg !43
|
| 701 |
+
%530 = select i1 %309, float 1.000000e+00, float %521, !dbg !43
|
| 702 |
+
%531 = select i1 %310, float 1.000000e+00, float %522, !dbg !43
|
| 703 |
+
%532 = select i1 %311, float 1.000000e+00, float %523, !dbg !43
|
| 704 |
+
%533 = fcmp olt float %529, 0x3810000000000000, !dbg !44
|
| 705 |
+
%534 = fmul float %529, 0x4160000000000000, !dbg !44
|
| 706 |
+
%.02.i = select i1 %533, float %534, float %529, !dbg !44
|
| 707 |
+
%i.i.0.i = select i1 %533, float -2.300000e+01, float 0.000000e+00, !dbg !44
|
| 708 |
+
%535 = bitcast float %.02.i to i32, !dbg !44
|
| 709 |
+
%536 = add i32 %535, -1060439283, !dbg !44
|
| 710 |
+
%537 = and i32 %536, -8388608, !dbg !44
|
| 711 |
+
%538 = sub i32 %535, %537, !dbg !44
|
| 712 |
+
%539 = bitcast i32 %538 to float, !dbg !44
|
| 713 |
+
%540 = sitofp i32 %537 to float, !dbg !44
|
| 714 |
+
%541 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 715 |
+
%.not.i46 = icmp eq i32 %541, 0, !dbg !44
|
| 716 |
+
%542 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %540, float 0x3E80000000000000, float %i.i.0.i) #6, !dbg !44
|
| 717 |
+
%543 = tail call float @llvm.nvvm.fma.rn.f(float %540, float 0x3E80000000000000, float %i.i.0.i) #6, !dbg !44
|
| 718 |
+
%.08.i = select i1 %.not.i46, float %543, float %542, !dbg !44
|
| 719 |
+
%544 = fadd float %539, -1.000000e+00, !dbg !44
|
| 720 |
+
%545 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 721 |
+
%.not1.i = icmp eq i32 %545, 0, !dbg !44
|
| 722 |
+
%546 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0x3FB8D64FE0000000, float %544, float 0xBFC58FE600000000) #6, !dbg !44
|
| 723 |
+
%547 = tail call float @llvm.nvvm.fma.rn.f(float 0x3FB8D64FE0000000, float %544, float 0xBFC58FE600000000) #6, !dbg !44
|
| 724 |
+
%.010.i = select i1 %.not1.i, float %547, float %546, !dbg !44
|
| 725 |
+
%548 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 726 |
+
%.not2.i = icmp eq i32 %548, 0, !dbg !44
|
| 727 |
+
%549 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i, float %544, float 0x3FC5F9E540000000) #6, !dbg !44
|
| 728 |
+
%550 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i, float %544, float 0x3FC5F9E540000000) #6, !dbg !44
|
| 729 |
+
%.011.i = select i1 %.not2.i, float %550, float %549, !dbg !44
|
| 730 |
+
%551 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 731 |
+
%.not3.i = icmp eq i32 %551, 0, !dbg !44
|
| 732 |
+
%552 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i, float %544, float 0xBFC6E9C860000000) #6, !dbg !44
|
| 733 |
+
%553 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i, float %544, float 0xBFC6E9C860000000) #6, !dbg !44
|
| 734 |
+
%.012.i = select i1 %.not3.i, float %553, float %552, !dbg !44
|
| 735 |
+
%554 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 736 |
+
%.not4.i = icmp eq i32 %554, 0, !dbg !44
|
| 737 |
+
%555 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i, float %544, float 0x3FCA417E80000000) #6, !dbg !44
|
| 738 |
+
%556 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i, float %544, float 0x3FCA417E80000000) #6, !dbg !44
|
| 739 |
+
%.09.i = select i1 %.not4.i, float %556, float %555, !dbg !44
|
| 740 |
+
%557 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 741 |
+
%.not5.i = icmp eq i32 %557, 0, !dbg !44
|
| 742 |
+
%558 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i, float %544, float 0xBFCEC79160000000) #6, !dbg !44
|
| 743 |
+
%559 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i, float %544, float 0xBFCEC79160000000) #6, !dbg !44
|
| 744 |
+
%.05.i = select i1 %.not5.i, float %559, float %558, !dbg !44
|
| 745 |
+
%560 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 746 |
+
%.not6.i = icmp eq i32 %560, 0, !dbg !44
|
| 747 |
+
%561 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %544, float 0x3FD277F320000000) #6, !dbg !44
|
| 748 |
+
%562 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %544, float 0x3FD277F320000000) #6, !dbg !44
|
| 749 |
+
%.01.i = select i1 %.not6.i, float %562, float %561, !dbg !44
|
| 750 |
+
%563 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 751 |
+
%.not7.i = icmp eq i32 %563, 0, !dbg !44
|
| 752 |
+
%564 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i, float %544, float 0xBFD7154920000000) #6, !dbg !44
|
| 753 |
+
%565 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i, float %544, float 0xBFD7154920000000) #6, !dbg !44
|
| 754 |
+
%.0.i47 = select i1 %.not7.i, float %565, float %564, !dbg !44
|
| 755 |
+
%566 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 756 |
+
%.not8.i = icmp eq i32 %566, 0, !dbg !44
|
| 757 |
+
%567 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i47, float %544, float 0x3FDEC70940000000) #6, !dbg !44
|
| 758 |
+
%568 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i47, float %544, float 0x3FDEC70940000000) #6, !dbg !44
|
| 759 |
+
%.07.i = select i1 %.not8.i, float %568, float %567, !dbg !44
|
| 760 |
+
%569 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 761 |
+
%.not9.i = icmp eq i32 %569, 0, !dbg !44
|
| 762 |
+
%570 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %544, float 0xBFE7154760000000) #6, !dbg !44
|
| 763 |
+
%571 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %544, float 0xBFE7154760000000) #6, !dbg !44
|
| 764 |
+
%.06.i = select i1 %.not9.i, float %571, float %570, !dbg !44
|
| 765 |
+
%572 = fmul float %544, %.06.i, !dbg !44
|
| 766 |
+
%573 = fmul float %544, %572, !dbg !44
|
| 767 |
+
%574 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 768 |
+
%.not10.i = icmp eq i32 %574, 0, !dbg !44
|
| 769 |
+
%575 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %544, float 0x3FF7154760000000, float %573) #6, !dbg !44
|
| 770 |
+
%576 = tail call float @llvm.nvvm.fma.rn.f(float %544, float 0x3FF7154760000000, float %573) #6, !dbg !44
|
| 771 |
+
%.04.i = select i1 %.not10.i, float %576, float %575, !dbg !44
|
| 772 |
+
%577 = fadd float %.08.i, %.04.i, !dbg !44
|
| 773 |
+
%578 = icmp ugt i32 %535, 2139095039, !dbg !44
|
| 774 |
+
br i1 %578, label %__nv_fmaf_rn.exit.i.i, label %__nv_log2f.exit, !dbg !44
|
| 775 |
+
|
| 776 |
+
__nv_fmaf_rn.exit.i.i: ; preds = %__nv_exp2f.exit45
|
| 777 |
+
%579 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 778 |
+
%.not11.i = icmp eq i32 %579, 0, !dbg !44
|
| 779 |
+
%580 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !44
|
| 780 |
+
%581 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !44
|
| 781 |
+
%.03.i = select i1 %.not11.i, float %581, float %580, !dbg !44
|
| 782 |
+
br label %__nv_log2f.exit, !dbg !44
|
| 783 |
+
|
| 784 |
+
__nv_log2f.exit: ; preds = %__nv_exp2f.exit45, %__nv_fmaf_rn.exit.i.i
|
| 785 |
+
%r.i.0.i = phi float [ %.03.i, %__nv_fmaf_rn.exit.i.i ], [ %577, %__nv_exp2f.exit45 ], !dbg !44
|
| 786 |
+
%582 = fcmp olt float %530, 0x3810000000000000, !dbg !44
|
| 787 |
+
%583 = fmul float %530, 0x4160000000000000, !dbg !44
|
| 788 |
+
%.02.i48 = select i1 %582, float %583, float %530, !dbg !44
|
| 789 |
+
%i.i.0.i49 = select i1 %582, float -2.300000e+01, float 0.000000e+00, !dbg !44
|
| 790 |
+
%584 = bitcast float %.02.i48 to i32, !dbg !44
|
| 791 |
+
%585 = add i32 %584, -1060439283, !dbg !44
|
| 792 |
+
%586 = and i32 %585, -8388608, !dbg !44
|
| 793 |
+
%587 = sub i32 %584, %586, !dbg !44
|
| 794 |
+
%588 = bitcast i32 %587 to float, !dbg !44
|
| 795 |
+
%589 = sitofp i32 %586 to float, !dbg !44
|
| 796 |
+
%590 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 797 |
+
%.not.i50 = icmp eq i32 %590, 0, !dbg !44
|
| 798 |
+
%591 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %589, float 0x3E80000000000000, float %i.i.0.i49) #6, !dbg !44
|
| 799 |
+
%592 = tail call float @llvm.nvvm.fma.rn.f(float %589, float 0x3E80000000000000, float %i.i.0.i49) #6, !dbg !44
|
| 800 |
+
%.08.i51 = select i1 %.not.i50, float %592, float %591, !dbg !44
|
| 801 |
+
%593 = fadd float %588, -1.000000e+00, !dbg !44
|
| 802 |
+
%594 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 803 |
+
%.not1.i52 = icmp eq i32 %594, 0, !dbg !44
|
| 804 |
+
%595 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0x3FB8D64FE0000000, float %593, float 0xBFC58FE600000000) #6, !dbg !44
|
| 805 |
+
%596 = tail call float @llvm.nvvm.fma.rn.f(float 0x3FB8D64FE0000000, float %593, float 0xBFC58FE600000000) #6, !dbg !44
|
| 806 |
+
%.010.i53 = select i1 %.not1.i52, float %596, float %595, !dbg !44
|
| 807 |
+
%597 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 808 |
+
%.not2.i54 = icmp eq i32 %597, 0, !dbg !44
|
| 809 |
+
%598 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i53, float %593, float 0x3FC5F9E540000000) #6, !dbg !44
|
| 810 |
+
%599 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i53, float %593, float 0x3FC5F9E540000000) #6, !dbg !44
|
| 811 |
+
%.011.i55 = select i1 %.not2.i54, float %599, float %598, !dbg !44
|
| 812 |
+
%600 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 813 |
+
%.not3.i56 = icmp eq i32 %600, 0, !dbg !44
|
| 814 |
+
%601 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i55, float %593, float 0xBFC6E9C860000000) #6, !dbg !44
|
| 815 |
+
%602 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i55, float %593, float 0xBFC6E9C860000000) #6, !dbg !44
|
| 816 |
+
%.012.i57 = select i1 %.not3.i56, float %602, float %601, !dbg !44
|
| 817 |
+
%603 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 818 |
+
%.not4.i58 = icmp eq i32 %603, 0, !dbg !44
|
| 819 |
+
%604 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i57, float %593, float 0x3FCA417E80000000) #6, !dbg !44
|
| 820 |
+
%605 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i57, float %593, float 0x3FCA417E80000000) #6, !dbg !44
|
| 821 |
+
%.09.i59 = select i1 %.not4.i58, float %605, float %604, !dbg !44
|
| 822 |
+
%606 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 823 |
+
%.not5.i60 = icmp eq i32 %606, 0, !dbg !44
|
| 824 |
+
%607 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i59, float %593, float 0xBFCEC79160000000) #6, !dbg !44
|
| 825 |
+
%608 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i59, float %593, float 0xBFCEC79160000000) #6, !dbg !44
|
| 826 |
+
%.05.i61 = select i1 %.not5.i60, float %608, float %607, !dbg !44
|
| 827 |
+
%609 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 828 |
+
%.not6.i62 = icmp eq i32 %609, 0, !dbg !44
|
| 829 |
+
%610 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i61, float %593, float 0x3FD277F320000000) #6, !dbg !44
|
| 830 |
+
%611 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i61, float %593, float 0x3FD277F320000000) #6, !dbg !44
|
| 831 |
+
%.01.i63 = select i1 %.not6.i62, float %611, float %610, !dbg !44
|
| 832 |
+
%612 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 833 |
+
%.not7.i64 = icmp eq i32 %612, 0, !dbg !44
|
| 834 |
+
%613 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i63, float %593, float 0xBFD7154920000000) #6, !dbg !44
|
| 835 |
+
%614 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i63, float %593, float 0xBFD7154920000000) #6, !dbg !44
|
| 836 |
+
%.0.i65 = select i1 %.not7.i64, float %614, float %613, !dbg !44
|
| 837 |
+
%615 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 838 |
+
%.not8.i66 = icmp eq i32 %615, 0, !dbg !44
|
| 839 |
+
%616 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i65, float %593, float 0x3FDEC70940000000) #6, !dbg !44
|
| 840 |
+
%617 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i65, float %593, float 0x3FDEC70940000000) #6, !dbg !44
|
| 841 |
+
%.07.i67 = select i1 %.not8.i66, float %617, float %616, !dbg !44
|
| 842 |
+
%618 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 843 |
+
%.not9.i68 = icmp eq i32 %618, 0, !dbg !44
|
| 844 |
+
%619 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i67, float %593, float 0xBFE7154760000000) #6, !dbg !44
|
| 845 |
+
%620 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i67, float %593, float 0xBFE7154760000000) #6, !dbg !44
|
| 846 |
+
%.06.i69 = select i1 %.not9.i68, float %620, float %619, !dbg !44
|
| 847 |
+
%621 = fmul float %593, %.06.i69, !dbg !44
|
| 848 |
+
%622 = fmul float %593, %621, !dbg !44
|
| 849 |
+
%623 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 850 |
+
%.not10.i70 = icmp eq i32 %623, 0, !dbg !44
|
| 851 |
+
%624 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %593, float 0x3FF7154760000000, float %622) #6, !dbg !44
|
| 852 |
+
%625 = tail call float @llvm.nvvm.fma.rn.f(float %593, float 0x3FF7154760000000, float %622) #6, !dbg !44
|
| 853 |
+
%.04.i71 = select i1 %.not10.i70, float %625, float %624, !dbg !44
|
| 854 |
+
%626 = fadd float %.08.i51, %.04.i71, !dbg !44
|
| 855 |
+
%627 = icmp ugt i32 %584, 2139095039, !dbg !44
|
| 856 |
+
br i1 %627, label %__nv_fmaf_rn.exit.i.i74, label %__nv_log2f.exit77, !dbg !44
|
| 857 |
+
|
| 858 |
+
__nv_fmaf_rn.exit.i.i74: ; preds = %__nv_log2f.exit
|
| 859 |
+
%628 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 860 |
+
%.not11.i75 = icmp eq i32 %628, 0, !dbg !44
|
| 861 |
+
%629 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i48, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !44
|
| 862 |
+
%630 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i48, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !44
|
| 863 |
+
%.03.i76 = select i1 %.not11.i75, float %630, float %629, !dbg !44
|
| 864 |
+
br label %__nv_log2f.exit77, !dbg !44
|
| 865 |
+
|
| 866 |
+
__nv_log2f.exit77: ; preds = %__nv_log2f.exit, %__nv_fmaf_rn.exit.i.i74
|
| 867 |
+
%r.i.0.i72 = phi float [ %.03.i76, %__nv_fmaf_rn.exit.i.i74 ], [ %626, %__nv_log2f.exit ], !dbg !44
|
| 868 |
+
%631 = fcmp olt float %531, 0x3810000000000000, !dbg !44
|
| 869 |
+
%632 = fmul float %531, 0x4160000000000000, !dbg !44
|
| 870 |
+
%.02.i78 = select i1 %631, float %632, float %531, !dbg !44
|
| 871 |
+
%i.i.0.i79 = select i1 %631, float -2.300000e+01, float 0.000000e+00, !dbg !44
|
| 872 |
+
%633 = bitcast float %.02.i78 to i32, !dbg !44
|
| 873 |
+
%634 = add i32 %633, -1060439283, !dbg !44
|
| 874 |
+
%635 = and i32 %634, -8388608, !dbg !44
|
| 875 |
+
%636 = sub i32 %633, %635, !dbg !44
|
| 876 |
+
%637 = bitcast i32 %636 to float, !dbg !44
|
| 877 |
+
%638 = sitofp i32 %635 to float, !dbg !44
|
| 878 |
+
%639 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 879 |
+
%.not.i80 = icmp eq i32 %639, 0, !dbg !44
|
| 880 |
+
%640 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %638, float 0x3E80000000000000, float %i.i.0.i79) #6, !dbg !44
|
| 881 |
+
%641 = tail call float @llvm.nvvm.fma.rn.f(float %638, float 0x3E80000000000000, float %i.i.0.i79) #6, !dbg !44
|
| 882 |
+
%.08.i81 = select i1 %.not.i80, float %641, float %640, !dbg !44
|
| 883 |
+
%642 = fadd float %637, -1.000000e+00, !dbg !44
|
| 884 |
+
%643 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 885 |
+
%.not1.i82 = icmp eq i32 %643, 0, !dbg !44
|
| 886 |
+
%644 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0x3FB8D64FE0000000, float %642, float 0xBFC58FE600000000) #6, !dbg !44
|
| 887 |
+
%645 = tail call float @llvm.nvvm.fma.rn.f(float 0x3FB8D64FE0000000, float %642, float 0xBFC58FE600000000) #6, !dbg !44
|
| 888 |
+
%.010.i83 = select i1 %.not1.i82, float %645, float %644, !dbg !44
|
| 889 |
+
%646 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 890 |
+
%.not2.i84 = icmp eq i32 %646, 0, !dbg !44
|
| 891 |
+
%647 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i83, float %642, float 0x3FC5F9E540000000) #6, !dbg !44
|
| 892 |
+
%648 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i83, float %642, float 0x3FC5F9E540000000) #6, !dbg !44
|
| 893 |
+
%.011.i85 = select i1 %.not2.i84, float %648, float %647, !dbg !44
|
| 894 |
+
%649 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 895 |
+
%.not3.i86 = icmp eq i32 %649, 0, !dbg !44
|
| 896 |
+
%650 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i85, float %642, float 0xBFC6E9C860000000) #6, !dbg !44
|
| 897 |
+
%651 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i85, float %642, float 0xBFC6E9C860000000) #6, !dbg !44
|
| 898 |
+
%.012.i87 = select i1 %.not3.i86, float %651, float %650, !dbg !44
|
| 899 |
+
%652 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 900 |
+
%.not4.i88 = icmp eq i32 %652, 0, !dbg !44
|
| 901 |
+
%653 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i87, float %642, float 0x3FCA417E80000000) #6, !dbg !44
|
| 902 |
+
%654 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i87, float %642, float 0x3FCA417E80000000) #6, !dbg !44
|
| 903 |
+
%.09.i89 = select i1 %.not4.i88, float %654, float %653, !dbg !44
|
| 904 |
+
%655 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 905 |
+
%.not5.i90 = icmp eq i32 %655, 0, !dbg !44
|
| 906 |
+
%656 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i89, float %642, float 0xBFCEC79160000000) #6, !dbg !44
|
| 907 |
+
%657 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i89, float %642, float 0xBFCEC79160000000) #6, !dbg !44
|
| 908 |
+
%.05.i91 = select i1 %.not5.i90, float %657, float %656, !dbg !44
|
| 909 |
+
%658 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 910 |
+
%.not6.i92 = icmp eq i32 %658, 0, !dbg !44
|
| 911 |
+
%659 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i91, float %642, float 0x3FD277F320000000) #6, !dbg !44
|
| 912 |
+
%660 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i91, float %642, float 0x3FD277F320000000) #6, !dbg !44
|
| 913 |
+
%.01.i93 = select i1 %.not6.i92, float %660, float %659, !dbg !44
|
| 914 |
+
%661 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 915 |
+
%.not7.i94 = icmp eq i32 %661, 0, !dbg !44
|
| 916 |
+
%662 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i93, float %642, float 0xBFD7154920000000) #6, !dbg !44
|
| 917 |
+
%663 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i93, float %642, float 0xBFD7154920000000) #6, !dbg !44
|
| 918 |
+
%.0.i95 = select i1 %.not7.i94, float %663, float %662, !dbg !44
|
| 919 |
+
%664 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 920 |
+
%.not8.i96 = icmp eq i32 %664, 0, !dbg !44
|
| 921 |
+
%665 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i95, float %642, float 0x3FDEC70940000000) #6, !dbg !44
|
| 922 |
+
%666 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i95, float %642, float 0x3FDEC70940000000) #6, !dbg !44
|
| 923 |
+
%.07.i97 = select i1 %.not8.i96, float %666, float %665, !dbg !44
|
| 924 |
+
%667 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 925 |
+
%.not9.i98 = icmp eq i32 %667, 0, !dbg !44
|
| 926 |
+
%668 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i97, float %642, float 0xBFE7154760000000) #6, !dbg !44
|
| 927 |
+
%669 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i97, float %642, float 0xBFE7154760000000) #6, !dbg !44
|
| 928 |
+
%.06.i99 = select i1 %.not9.i98, float %669, float %668, !dbg !44
|
| 929 |
+
%670 = fmul float %642, %.06.i99, !dbg !44
|
| 930 |
+
%671 = fmul float %642, %670, !dbg !44
|
| 931 |
+
%672 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 932 |
+
%.not10.i100 = icmp eq i32 %672, 0, !dbg !44
|
| 933 |
+
%673 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %642, float 0x3FF7154760000000, float %671) #6, !dbg !44
|
| 934 |
+
%674 = tail call float @llvm.nvvm.fma.rn.f(float %642, float 0x3FF7154760000000, float %671) #6, !dbg !44
|
| 935 |
+
%.04.i101 = select i1 %.not10.i100, float %674, float %673, !dbg !44
|
| 936 |
+
%675 = fadd float %.08.i81, %.04.i101, !dbg !44
|
| 937 |
+
%676 = icmp ugt i32 %633, 2139095039, !dbg !44
|
| 938 |
+
br i1 %676, label %__nv_fmaf_rn.exit.i.i104, label %__nv_log2f.exit107, !dbg !44
|
| 939 |
+
|
| 940 |
+
__nv_fmaf_rn.exit.i.i104: ; preds = %__nv_log2f.exit77
|
| 941 |
+
%677 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 942 |
+
%.not11.i105 = icmp eq i32 %677, 0, !dbg !44
|
| 943 |
+
%678 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i78, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !44
|
| 944 |
+
%679 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i78, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !44
|
| 945 |
+
%.03.i106 = select i1 %.not11.i105, float %679, float %678, !dbg !44
|
| 946 |
+
br label %__nv_log2f.exit107, !dbg !44
|
| 947 |
+
|
| 948 |
+
__nv_log2f.exit107: ; preds = %__nv_log2f.exit77, %__nv_fmaf_rn.exit.i.i104
|
| 949 |
+
%r.i.0.i102 = phi float [ %.03.i106, %__nv_fmaf_rn.exit.i.i104 ], [ %675, %__nv_log2f.exit77 ], !dbg !44
|
| 950 |
+
%680 = fcmp olt float %532, 0x3810000000000000, !dbg !44
|
| 951 |
+
%681 = fmul float %532, 0x4160000000000000, !dbg !44
|
| 952 |
+
%.02.i108 = select i1 %680, float %681, float %532, !dbg !44
|
| 953 |
+
%i.i.0.i109 = select i1 %680, float -2.300000e+01, float 0.000000e+00, !dbg !44
|
| 954 |
+
%682 = bitcast float %.02.i108 to i32, !dbg !44
|
| 955 |
+
%683 = add i32 %682, -1060439283, !dbg !44
|
| 956 |
+
%684 = and i32 %683, -8388608, !dbg !44
|
| 957 |
+
%685 = sub i32 %682, %684, !dbg !44
|
| 958 |
+
%686 = bitcast i32 %685 to float, !dbg !44
|
| 959 |
+
%687 = sitofp i32 %684 to float, !dbg !44
|
| 960 |
+
%688 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 961 |
+
%.not.i110 = icmp eq i32 %688, 0, !dbg !44
|
| 962 |
+
%689 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %687, float 0x3E80000000000000, float %i.i.0.i109) #6, !dbg !44
|
| 963 |
+
%690 = tail call float @llvm.nvvm.fma.rn.f(float %687, float 0x3E80000000000000, float %i.i.0.i109) #6, !dbg !44
|
| 964 |
+
%.08.i111 = select i1 %.not.i110, float %690, float %689, !dbg !44
|
| 965 |
+
%691 = fadd float %686, -1.000000e+00, !dbg !44
|
| 966 |
+
%692 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 967 |
+
%.not1.i112 = icmp eq i32 %692, 0, !dbg !44
|
| 968 |
+
%693 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0x3FB8D64FE0000000, float %691, float 0xBFC58FE600000000) #6, !dbg !44
|
| 969 |
+
%694 = tail call float @llvm.nvvm.fma.rn.f(float 0x3FB8D64FE0000000, float %691, float 0xBFC58FE600000000) #6, !dbg !44
|
| 970 |
+
%.010.i113 = select i1 %.not1.i112, float %694, float %693, !dbg !44
|
| 971 |
+
%695 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 972 |
+
%.not2.i114 = icmp eq i32 %695, 0, !dbg !44
|
| 973 |
+
%696 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i113, float %691, float 0x3FC5F9E540000000) #6, !dbg !44
|
| 974 |
+
%697 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i113, float %691, float 0x3FC5F9E540000000) #6, !dbg !44
|
| 975 |
+
%.011.i115 = select i1 %.not2.i114, float %697, float %696, !dbg !44
|
| 976 |
+
%698 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 977 |
+
%.not3.i116 = icmp eq i32 %698, 0, !dbg !44
|
| 978 |
+
%699 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i115, float %691, float 0xBFC6E9C860000000) #6, !dbg !44
|
| 979 |
+
%700 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i115, float %691, float 0xBFC6E9C860000000) #6, !dbg !44
|
| 980 |
+
%.012.i117 = select i1 %.not3.i116, float %700, float %699, !dbg !44
|
| 981 |
+
%701 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 982 |
+
%.not4.i118 = icmp eq i32 %701, 0, !dbg !44
|
| 983 |
+
%702 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i117, float %691, float 0x3FCA417E80000000) #6, !dbg !44
|
| 984 |
+
%703 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i117, float %691, float 0x3FCA417E80000000) #6, !dbg !44
|
| 985 |
+
%.09.i119 = select i1 %.not4.i118, float %703, float %702, !dbg !44
|
| 986 |
+
%704 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 987 |
+
%.not5.i120 = icmp eq i32 %704, 0, !dbg !44
|
| 988 |
+
%705 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i119, float %691, float 0xBFCEC79160000000) #6, !dbg !44
|
| 989 |
+
%706 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i119, float %691, float 0xBFCEC79160000000) #6, !dbg !44
|
| 990 |
+
%.05.i121 = select i1 %.not5.i120, float %706, float %705, !dbg !44
|
| 991 |
+
%707 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 992 |
+
%.not6.i122 = icmp eq i32 %707, 0, !dbg !44
|
| 993 |
+
%708 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i121, float %691, float 0x3FD277F320000000) #6, !dbg !44
|
| 994 |
+
%709 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i121, float %691, float 0x3FD277F320000000) #6, !dbg !44
|
| 995 |
+
%.01.i123 = select i1 %.not6.i122, float %709, float %708, !dbg !44
|
| 996 |
+
%710 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 997 |
+
%.not7.i124 = icmp eq i32 %710, 0, !dbg !44
|
| 998 |
+
%711 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i123, float %691, float 0xBFD7154920000000) #6, !dbg !44
|
| 999 |
+
%712 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i123, float %691, float 0xBFD7154920000000) #6, !dbg !44
|
| 1000 |
+
%.0.i125 = select i1 %.not7.i124, float %712, float %711, !dbg !44
|
| 1001 |
+
%713 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 1002 |
+
%.not8.i126 = icmp eq i32 %713, 0, !dbg !44
|
| 1003 |
+
%714 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i125, float %691, float 0x3FDEC70940000000) #6, !dbg !44
|
| 1004 |
+
%715 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i125, float %691, float 0x3FDEC70940000000) #6, !dbg !44
|
| 1005 |
+
%.07.i127 = select i1 %.not8.i126, float %715, float %714, !dbg !44
|
| 1006 |
+
%716 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 1007 |
+
%.not9.i128 = icmp eq i32 %716, 0, !dbg !44
|
| 1008 |
+
%717 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i127, float %691, float 0xBFE7154760000000) #6, !dbg !44
|
| 1009 |
+
%718 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i127, float %691, float 0xBFE7154760000000) #6, !dbg !44
|
| 1010 |
+
%.06.i129 = select i1 %.not9.i128, float %718, float %717, !dbg !44
|
| 1011 |
+
%719 = fmul float %691, %.06.i129, !dbg !44
|
| 1012 |
+
%720 = fmul float %691, %719, !dbg !44
|
| 1013 |
+
%721 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 1014 |
+
%.not10.i130 = icmp eq i32 %721, 0, !dbg !44
|
| 1015 |
+
%722 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %691, float 0x3FF7154760000000, float %720) #6, !dbg !44
|
| 1016 |
+
%723 = tail call float @llvm.nvvm.fma.rn.f(float %691, float 0x3FF7154760000000, float %720) #6, !dbg !44
|
| 1017 |
+
%.04.i131 = select i1 %.not10.i130, float %723, float %722, !dbg !44
|
| 1018 |
+
%724 = fadd float %.08.i111, %.04.i131, !dbg !44
|
| 1019 |
+
%725 = icmp ugt i32 %682, 2139095039, !dbg !44
|
| 1020 |
+
br i1 %725, label %__nv_fmaf_rn.exit.i.i134, label %__nv_log2f.exit137, !dbg !44
|
| 1021 |
+
|
| 1022 |
+
__nv_fmaf_rn.exit.i.i134: ; preds = %__nv_log2f.exit107
|
| 1023 |
+
%726 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
|
| 1024 |
+
%.not11.i135 = icmp eq i32 %726, 0, !dbg !44
|
| 1025 |
+
%727 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i108, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !44
|
| 1026 |
+
%728 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i108, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !44
|
| 1027 |
+
%.03.i136 = select i1 %.not11.i135, float %728, float %727, !dbg !44
|
| 1028 |
+
br label %__nv_log2f.exit137, !dbg !44
|
| 1029 |
+
|
| 1030 |
+
__nv_log2f.exit137: ; preds = %__nv_log2f.exit107, %__nv_fmaf_rn.exit.i.i134
|
| 1031 |
+
%r.i.0.i132 = phi float [ %.03.i136, %__nv_fmaf_rn.exit.i.i134 ], [ %724, %__nv_log2f.exit107 ], !dbg !44
|
| 1032 |
+
%729 = insertelement <4 x float> poison, float %.02.i, i64 0, !dbg !44
|
| 1033 |
+
%730 = insertelement <4 x float> %729, float %.02.i48, i64 1, !dbg !44
|
| 1034 |
+
%731 = insertelement <4 x float> %730, float %.02.i78, i64 2, !dbg !44
|
| 1035 |
+
%732 = insertelement <4 x float> %731, float %.02.i108, i64 3, !dbg !44
|
| 1036 |
+
%733 = fcmp oeq <4 x float> %732, zeroinitializer, !dbg !44
|
| 1037 |
+
%734 = and i32 %13, 127, !dbg !10
|
| 1038 |
+
%735 = or disjoint i32 %12, %734, !dbg !11
|
| 1039 |
+
%736 = sext i32 %735 to i64, !dbg !14
|
| 1040 |
+
%.frozen = freeze i64 %5, !dbg !45
|
| 1041 |
+
%737 = sdiv i64 %736, %.frozen, !dbg !45
|
| 1042 |
+
%738 = icmp slt i32 %12, 0, !dbg !47
|
| 1043 |
+
%739 = icmp slt i64 %5, 0, !dbg !48
|
| 1044 |
+
%740 = xor i1 %738, %739, !dbg !49
|
| 1045 |
+
%741 = mul i64 %737, %.frozen, !dbg !14
|
| 1046 |
+
%.decomposed = sub i64 %736, %741, !dbg !14
|
| 1047 |
+
%.not = icmp ne i64 %.decomposed, 0, !dbg !50
|
| 1048 |
+
%narrow = select i1 %740, i1 %.not, i1 false, !dbg !51
|
| 1049 |
+
%742 = sext i1 %narrow to i64, !dbg !51
|
| 1050 |
+
%743 = add nsw i64 %737, %742, !dbg !51
|
| 1051 |
+
%744 = icmp slt i32 %735, %6, !dbg !12
|
| 1052 |
+
%745 = insertelement <4 x float> poison, float %r.i.0.i, i64 0, !dbg !44
|
| 1053 |
+
%746 = insertelement <4 x float> %745, float %r.i.0.i72, i64 1, !dbg !44
|
| 1054 |
+
%747 = insertelement <4 x float> %746, float %r.i.0.i102, i64 2, !dbg !44
|
| 1055 |
+
%748 = insertelement <4 x float> %747, float %r.i.0.i132, i64 3, !dbg !44
|
| 1056 |
+
%749 = select <4 x i1> %733, <4 x float> splat (float 0xFFF0000000000000), <4 x float> %748, !dbg !44
|
| 1057 |
+
%750 = insertelement <4 x float> poison, float %291, i64 0, !dbg !52
|
| 1058 |
+
%751 = insertelement <4 x float> %750, float %292, i64 1, !dbg !52
|
| 1059 |
+
%752 = insertelement <4 x float> %751, float %293, i64 2, !dbg !52
|
| 1060 |
+
%753 = insertelement <4 x float> %752, float %294, i64 3, !dbg !52
|
| 1061 |
+
%754 = fadd <4 x float> %753, %749, !dbg !52
|
| 1062 |
+
%755 = fmul <4 x float> %754, splat (float 0x3FE62E4300000000), !dbg !53
|
| 1063 |
+
%756 = icmp slt i64 %5, 2, !dbg !54
|
| 1064 |
+
%757 = icmp sgt i64 %5, 1, !dbg !55
|
| 1065 |
+
%758 = select i1 %757, i64 %5, i64 0, !dbg !56
|
| 1066 |
+
%759 = zext i1 %756 to i64, !dbg !57
|
| 1067 |
+
%760 = add i64 %758, %759, !dbg !58
|
| 1068 |
+
%761 = mul i64 %743, %760, !dbg !59
|
| 1069 |
+
%762 = getelementptr float, ptr addrspace(1) %4, i64 %.decomposed, !dbg !60
|
| 1070 |
+
%763 = getelementptr float, ptr addrspace(1) %762, i64 %761, !dbg !60
|
| 1071 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61
|
| 1072 |
+
store <4 x float> %755, ptr addrspace(3) %296, align 16, !dbg !61
|
| 1073 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61
|
| 1074 |
+
%764 = tail call i32 asm sideeffect "ldmatrix.sync.aligned.m8n8.x1.shared.b16 {$0}, [$1];", "=r,r"(i32 %306) #6, !dbg !61
|
| 1075 |
+
%765 = and i32 %13, 128, !dbg !61
|
| 1076 |
+
%766 = icmp eq i32 %765, 0, !dbg !61
|
| 1077 |
+
%767 = and i1 %766, %744, !dbg !61
|
| 1078 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %764, ptr addrspace(1) %763, i1 %767) #6, !dbg !61
|
| 1079 |
+
%768 = getelementptr float, ptr addrspace(1) %2, i64 %736, !dbg !62
|
| 1080 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %307, ptr addrspace(1) %768, i1 %767) #6, !dbg !63
|
| 1081 |
+
%769 = getelementptr float, ptr addrspace(1) %3, i64 %736, !dbg !64
|
| 1082 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %528, ptr addrspace(1) %769, i1 %767) #6, !dbg !65
|
| 1083 |
+
ret void, !dbg !66
|
| 1084 |
+
}
|
| 1085 |
+
|
| 1086 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 1087 |
+
declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
|
| 1088 |
+
|
| 1089 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 1090 |
+
declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
|
| 1091 |
+
|
| 1092 |
+
; Function Attrs: convergent nocallback nounwind
|
| 1093 |
+
declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2
|
| 1094 |
+
|
| 1095 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
| 1096 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3
|
| 1097 |
+
|
| 1098 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
|
| 1099 |
+
|
| 1100 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
| 1101 |
+
declare float @llvm.nvvm.ex2.approx.ftz.f(float) #5
|
| 1102 |
+
|
| 1103 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
| 1104 |
+
declare float @llvm.nvvm.ex2.approx.f(float) #5
|
| 1105 |
+
|
| 1106 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 1107 |
+
declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #1
|
| 1108 |
+
|
| 1109 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 1110 |
+
declare float @llvm.nvvm.fma.rn.f(float, float, float) #1
|
| 1111 |
+
|
| 1112 |
+
attributes #0 = { nounwind "nvvm.reqntid"="256" }
|
| 1113 |
+
attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
| 1114 |
+
attributes #2 = { convergent nocallback nounwind }
|
| 1115 |
+
attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
| 1116 |
+
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
| 1117 |
+
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
| 1118 |
+
attributes #6 = { nounwind }
|
| 1119 |
+
|
| 1120 |
+
!llvm.dbg.cu = !{!0}
|
| 1121 |
+
!llvm.module.flags = !{!2, !3}
|
| 1122 |
+
!llvm.ident = !{!4}
|
| 1123 |
+
|
| 1124 |
+
!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
|
| 1125 |
+
!1 = !DIFile(filename: "c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py", directory: "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c")
|
| 1126 |
+
!2 = !{i32 2, !"Debug Info Version", i32 3}
|
| 1127 |
+
!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
| 1128 |
+
!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
| 1129 |
+
!5 = distinct !DISubprogram(name: "triton_per_fused_mul_1", linkageName: "triton_per_fused_mul_1", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
|
| 1130 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
| 1131 |
+
!7 = !{}
|
| 1132 |
+
!8 = !DILocation(line: 23, column: 28, scope: !5)
|
| 1133 |
+
!9 = !DILocation(line: 23, column: 33, scope: !5)
|
| 1134 |
+
!10 = !DILocation(line: 24, column: 44, scope: !5)
|
| 1135 |
+
!11 = !DILocation(line: 24, column: 23, scope: !5)
|
| 1136 |
+
!12 = !DILocation(line: 25, column: 21, scope: !5)
|
| 1137 |
+
!13 = !DILocation(line: 26, column: 38, scope: !5)
|
| 1138 |
+
!14 = !DILocation(line: 33, column: 19, scope: !5)
|
| 1139 |
+
!15 = !DILocation(line: 35, column: 38, scope: !5)
|
| 1140 |
+
!16 = !DILocation(line: 35, column: 42, scope: !5)
|
| 1141 |
+
!17 = !DILocation(line: 35, column: 35, scope: !5)
|
| 1142 |
+
!18 = !DILocation(line: 35, column: 30, scope: !5)
|
| 1143 |
+
!19 = !DILocation(line: 35, column: 49, scope: !5)
|
| 1144 |
+
!20 = !DILocation(line: 36, column: 30, scope: !5)
|
| 1145 |
+
!21 = !DILocation(line: 36, column: 49, scope: !5)
|
| 1146 |
+
!22 = !DILocation(line: 38, column: 33, scope: !5)
|
| 1147 |
+
!23 = !DILocation(line: 110, column: 15, scope: !24, inlinedAt: !26)
|
| 1148 |
+
!24 = distinct !DILexicalBlockFile(scope: !5, file: !25, discriminator: 0)
|
| 1149 |
+
!25 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime")
|
| 1150 |
+
!26 = !DILocation(line: 39, column: 37, scope: !5)
|
| 1151 |
+
!27 = !DILocation(line: 112, column: 21, scope: !24, inlinedAt: !26)
|
| 1152 |
+
!28 = !DILocation(line: 112, column: 16, scope: !24, inlinedAt: !26)
|
| 1153 |
+
!29 = !DILocation(line: 113, column: 29, scope: !24, inlinedAt: !26)
|
| 1154 |
+
!30 = !DILocation(line: 123, column: 29, scope: !24, inlinedAt: !26)
|
| 1155 |
+
!31 = !DILocation(line: 39, column: 40, scope: !5)
|
| 1156 |
+
!32 = !DILocation(line: 41, column: 19, scope: !5)
|
| 1157 |
+
!33 = !DILocation(line: 42, column: 18, scope: !5)
|
| 1158 |
+
!34 = !DILocation(line: 44, column: 33, scope: !5)
|
| 1159 |
+
!35 = !DILocation(line: 45, column: 27, scope: !5)
|
| 1160 |
+
!36 = !DILocation(line: 46, column: 19, scope: !5)
|
| 1161 |
+
!37 = !DILocation(line: 291, column: 36, scope: !38, inlinedAt: !40)
|
| 1162 |
+
!38 = distinct !DILexicalBlockFile(scope: !5, file: !39, discriminator: 0)
|
| 1163 |
+
!39 = !DIFile(filename: "standard.py", directory: "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language")
|
| 1164 |
+
!40 = !DILocation(line: 49, column: 26, scope: !5)
|
| 1165 |
+
!41 = !DILocation(line: 261, column: 15, scope: !38, inlinedAt: !40)
|
| 1166 |
+
!42 = !DILocation(line: 49, column: 29, scope: !5)
|
| 1167 |
+
!43 = !DILocation(line: 51, column: 34, scope: !5)
|
| 1168 |
+
!44 = !DILocation(line: 52, column: 27, scope: !5)
|
| 1169 |
+
!45 = !DILocation(line: 72, column: 16, scope: !24, inlinedAt: !46)
|
| 1170 |
+
!46 = !DILocation(line: 34, column: 51, scope: !5)
|
| 1171 |
+
!47 = !DILocation(line: 75, column: 25, scope: !24, inlinedAt: !46)
|
| 1172 |
+
!48 = !DILocation(line: 75, column: 36, scope: !24, inlinedAt: !46)
|
| 1173 |
+
!49 = !DILocation(line: 75, column: 32, scope: !24, inlinedAt: !46)
|
| 1174 |
+
!50 = !DILocation(line: 74, column: 34, scope: !24, inlinedAt: !46)
|
| 1175 |
+
!51 = !DILocation(line: 75, column: 47, scope: !24, inlinedAt: !46)
|
| 1176 |
+
!52 = !DILocation(line: 53, column: 20, scope: !5)
|
| 1177 |
+
!53 = !DILocation(line: 55, column: 20, scope: !5)
|
| 1178 |
+
!54 = !DILocation(line: 56, column: 49, scope: !5)
|
| 1179 |
+
!55 = !DILocation(line: 56, column: 75, scope: !5)
|
| 1180 |
+
!56 = !DILocation(line: 56, column: 66, scope: !5)
|
| 1181 |
+
!57 = !DILocation(line: 56, scope: !5)
|
| 1182 |
+
!58 = !DILocation(line: 56, column: 57, scope: !5)
|
| 1183 |
+
!59 = !DILocation(line: 56, column: 34, scope: !5)
|
| 1184 |
+
!60 = !DILocation(line: 56, column: 25, scope: !5)
|
| 1185 |
+
!61 = !DILocation(line: 56, column: 89, scope: !5)
|
| 1186 |
+
!62 = !DILocation(line: 57, column: 25, scope: !5)
|
| 1187 |
+
!63 = !DILocation(line: 57, column: 36, scope: !5)
|
| 1188 |
+
!64 = !DILocation(line: 58, column: 25, scope: !5)
|
| 1189 |
+
!65 = !DILocation(line: 58, column: 37, scope: !5)
|
| 1190 |
+
!66 = !DILocation(line: 58, column: 4, scope: !5)
|
progress/SpecForge/cache/compiled_kernels/triton/2/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.ptx
ADDED
|
@@ -0,0 +1,1141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// Generated by LLVM NVPTX Back-End
|
| 3 |
+
//
|
| 4 |
+
|
| 5 |
+
.version 8.7
|
| 6 |
+
.target sm_90a
|
| 7 |
+
.address_size 64
|
| 8 |
+
|
| 9 |
+
// .globl triton_per_fused_mul_1 // -- Begin function triton_per_fused_mul_1
|
| 10 |
+
.extern .shared .align 16 .b8 global_smem[];
|
| 11 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
|
| 12 |
+
// @triton_per_fused_mul_1
|
| 13 |
+
.visible .entry triton_per_fused_mul_1(
|
| 14 |
+
.param .u64 .ptr .global .align 1 triton_per_fused_mul_1_param_0,
|
| 15 |
+
.param .u64 .ptr .global .align 1 triton_per_fused_mul_1_param_1,
|
| 16 |
+
.param .u64 .ptr .global .align 1 triton_per_fused_mul_1_param_2,
|
| 17 |
+
.param .u64 .ptr .global .align 1 triton_per_fused_mul_1_param_3,
|
| 18 |
+
.param .u64 .ptr .global .align 1 triton_per_fused_mul_1_param_4,
|
| 19 |
+
.param .u64 triton_per_fused_mul_1_param_5,
|
| 20 |
+
.param .u32 triton_per_fused_mul_1_param_6,
|
| 21 |
+
.param .u32 triton_per_fused_mul_1_param_7,
|
| 22 |
+
.param .u64 .ptr .global .align 1 triton_per_fused_mul_1_param_8,
|
| 23 |
+
.param .u64 .ptr .global .align 1 triton_per_fused_mul_1_param_9
|
| 24 |
+
)
|
| 25 |
+
.reqntid 256
|
| 26 |
+
{
|
| 27 |
+
.reg .pred %p<110>;
|
| 28 |
+
.reg .b32 %r<491>;
|
| 29 |
+
.reg .b64 %rd<57>;
|
| 30 |
+
.loc 1 18 0 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:18:0
|
| 31 |
+
$L__func_begin0:
|
| 32 |
+
.loc 1 18 0 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:18:0
|
| 33 |
+
|
| 34 |
+
// %bb.0: // %__nv_exp2f.exit
|
| 35 |
+
ld.param.b32 %r26, [triton_per_fused_mul_1_param_6];
|
| 36 |
+
ld.param.b64 %rd13, [triton_per_fused_mul_1_param_5];
|
| 37 |
+
ld.param.b64 %rd22, [triton_per_fused_mul_1_param_0];
|
| 38 |
+
ld.param.b64 %rd23, [triton_per_fused_mul_1_param_1];
|
| 39 |
+
$L__tmp0:
|
| 40 |
+
.loc 1 23 28 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:23:28
|
| 41 |
+
mov.u32 %r143, %ctaid.x;
|
| 42 |
+
.loc 1 23 33 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:23:33
|
| 43 |
+
shl.b32 %r1, %r143, 7;
|
| 44 |
+
.loc 1 24 44 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:24:44
|
| 45 |
+
mov.u32 %r2, %tid.x;
|
| 46 |
+
shr.u32 %r144, %r2, 5;
|
| 47 |
+
and.b32 %r145, %r2, 31;
|
| 48 |
+
shl.b32 %r146, %r145, 2;
|
| 49 |
+
.loc 1 24 23 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:24:23
|
| 50 |
+
or.b32 %r147, %r146, %r1;
|
| 51 |
+
.loc 1 25 21 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:25:21
|
| 52 |
+
setp.lt.s32 %p1, %r147, %r26;
|
| 53 |
+
.loc 1 26 38 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:26:38
|
| 54 |
+
bfe.u32 %r148, %r2, 5, 3;
|
| 55 |
+
or.b32 %r149, %r148, 8;
|
| 56 |
+
or.b32 %r150, %r148, 16;
|
| 57 |
+
or.b32 %r151, %r144, 24;
|
| 58 |
+
.loc 1 33 19 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:33:19
|
| 59 |
+
cvt.s64.s32 %rd24, %r147;
|
| 60 |
+
.loc 1 35 38 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:35:38
|
| 61 |
+
shl.b64 %rd25, %rd13, 5;
|
| 62 |
+
.loc 1 35 42 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:35:42
|
| 63 |
+
cvt.u64.u32 %rd26, %r148;
|
| 64 |
+
cvt.u64.u32 %rd27, %r149;
|
| 65 |
+
cvt.u64.u32 %rd28, %r150;
|
| 66 |
+
cvt.u64.u32 %rd29, %r151;
|
| 67 |
+
.loc 1 35 35 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:35:35
|
| 68 |
+
mad.lo.s64 %rd30, %rd25, %rd26, %rd24;
|
| 69 |
+
mad.lo.s64 %rd31, %rd25, %rd27, %rd24;
|
| 70 |
+
mad.lo.s64 %rd32, %rd25, %rd28, %rd24;
|
| 71 |
+
mad.lo.s64 %rd33, %rd25, %rd29, %rd24;
|
| 72 |
+
.loc 1 35 30 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:35:30
|
| 73 |
+
shl.b64 %rd34, %rd30, 2;
|
| 74 |
+
add.s64 %rd14, %rd22, %rd34;
|
| 75 |
+
shl.b64 %rd35, %rd31, 2;
|
| 76 |
+
add.s64 %rd15, %rd22, %rd35;
|
| 77 |
+
shl.b64 %rd36, %rd32, 2;
|
| 78 |
+
add.s64 %rd16, %rd22, %rd36;
|
| 79 |
+
shl.b64 %rd37, %rd33, 2;
|
| 80 |
+
add.s64 %rd17, %rd22, %rd37;
|
| 81 |
+
mov.b32 %r31, 0;
|
| 82 |
+
.loc 1 35 49 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:35:49
|
| 83 |
+
// begin inline asm
|
| 84 |
+
mov.u32 %r27, %r31;
|
| 85 |
+
mov.u32 %r28, %r31;
|
| 86 |
+
mov.u32 %r29, %r31;
|
| 87 |
+
mov.u32 %r30, %r31;
|
| 88 |
+
@%p1 ld.global.v4.b32 { %r27, %r28, %r29, %r30 }, [ %rd14 + 0 ];
|
| 89 |
+
// end inline asm
|
| 90 |
+
// begin inline asm
|
| 91 |
+
mov.u32 %r35, %r31;
|
| 92 |
+
mov.u32 %r36, %r31;
|
| 93 |
+
mov.u32 %r37, %r31;
|
| 94 |
+
mov.u32 %r38, %r31;
|
| 95 |
+
@%p1 ld.global.v4.b32 { %r35, %r36, %r37, %r38 }, [ %rd15 + 0 ];
|
| 96 |
+
// end inline asm
|
| 97 |
+
// begin inline asm
|
| 98 |
+
mov.u32 %r43, %r31;
|
| 99 |
+
mov.u32 %r44, %r31;
|
| 100 |
+
mov.u32 %r45, %r31;
|
| 101 |
+
mov.u32 %r46, %r31;
|
| 102 |
+
@%p1 ld.global.v4.b32 { %r43, %r44, %r45, %r46 }, [ %rd16 + 0 ];
|
| 103 |
+
// end inline asm
|
| 104 |
+
// begin inline asm
|
| 105 |
+
mov.u32 %r51, %r31;
|
| 106 |
+
mov.u32 %r52, %r31;
|
| 107 |
+
mov.u32 %r53, %r31;
|
| 108 |
+
mov.u32 %r54, %r31;
|
| 109 |
+
@%p1 ld.global.v4.b32 { %r51, %r52, %r53, %r54 }, [ %rd17 + 0 ];
|
| 110 |
+
// end inline asm
|
| 111 |
+
.loc 1 36 30 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:36:30
|
| 112 |
+
add.s64 %rd18, %rd23, %rd34;
|
| 113 |
+
add.s64 %rd19, %rd23, %rd35;
|
| 114 |
+
add.s64 %rd20, %rd23, %rd36;
|
| 115 |
+
add.s64 %rd21, %rd23, %rd37;
|
| 116 |
+
.loc 1 36 49 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:36:49
|
| 117 |
+
// begin inline asm
|
| 118 |
+
mov.u32 %r59, %r31;
|
| 119 |
+
mov.u32 %r60, %r31;
|
| 120 |
+
mov.u32 %r61, %r31;
|
| 121 |
+
mov.u32 %r62, %r31;
|
| 122 |
+
@%p1 ld.global.v4.b32 { %r59, %r60, %r61, %r62 }, [ %rd18 + 0 ];
|
| 123 |
+
// end inline asm
|
| 124 |
+
// begin inline asm
|
| 125 |
+
mov.u32 %r67, %r31;
|
| 126 |
+
mov.u32 %r68, %r31;
|
| 127 |
+
mov.u32 %r69, %r31;
|
| 128 |
+
mov.u32 %r70, %r31;
|
| 129 |
+
@%p1 ld.global.v4.b32 { %r67, %r68, %r69, %r70 }, [ %rd19 + 0 ];
|
| 130 |
+
// end inline asm
|
| 131 |
+
// begin inline asm
|
| 132 |
+
mov.u32 %r75, %r31;
|
| 133 |
+
mov.u32 %r76, %r31;
|
| 134 |
+
mov.u32 %r77, %r31;
|
| 135 |
+
mov.u32 %r78, %r31;
|
| 136 |
+
@%p1 ld.global.v4.b32 { %r75, %r76, %r77, %r78 }, [ %rd20 + 0 ];
|
| 137 |
+
// end inline asm
|
| 138 |
+
// begin inline asm
|
| 139 |
+
mov.u32 %r83, %r31;
|
| 140 |
+
mov.u32 %r84, %r31;
|
| 141 |
+
mov.u32 %r85, %r31;
|
| 142 |
+
mov.u32 %r86, %r31;
|
| 143 |
+
@%p1 ld.global.v4.b32 { %r83, %r84, %r85, %r86 }, [ %rd21 + 0 ];
|
| 144 |
+
// end inline asm
|
| 145 |
+
.loc 1 38 33 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:38:33
|
| 146 |
+
selp.f32 %r152, %r27, 0fFF800000, %p1;
|
| 147 |
+
selp.f32 %r153, %r28, 0fFF800000, %p1;
|
| 148 |
+
selp.f32 %r154, %r29, 0fFF800000, %p1;
|
| 149 |
+
selp.f32 %r155, %r30, 0fFF800000, %p1;
|
| 150 |
+
selp.f32 %r156, %r35, 0fFF800000, %p1;
|
| 151 |
+
selp.f32 %r157, %r36, 0fFF800000, %p1;
|
| 152 |
+
selp.f32 %r158, %r37, 0fFF800000, %p1;
|
| 153 |
+
selp.f32 %r159, %r38, 0fFF800000, %p1;
|
| 154 |
+
selp.f32 %r160, %r43, 0fFF800000, %p1;
|
| 155 |
+
selp.f32 %r161, %r44, 0fFF800000, %p1;
|
| 156 |
+
selp.f32 %r162, %r45, 0fFF800000, %p1;
|
| 157 |
+
selp.f32 %r163, %r46, 0fFF800000, %p1;
|
| 158 |
+
selp.f32 %r164, %r51, 0fFF800000, %p1;
|
| 159 |
+
selp.f32 %r165, %r52, 0fFF800000, %p1;
|
| 160 |
+
selp.f32 %r166, %r53, 0fFF800000, %p1;
|
| 161 |
+
selp.f32 %r167, %r54, 0fFF800000, %p1;
|
| 162 |
+
$L__tmp1:
|
| 163 |
+
.loc 2 110 15 // triton_helpers.py:110:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 164 |
+
setp.gt.f32 %p33, %r152, %r156;
|
| 165 |
+
.loc 2 112 21 // triton_helpers.py:112:21 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 166 |
+
setp.nan.f32 %p34, %r152, %r152;
|
| 167 |
+
.loc 2 113 29 // triton_helpers.py:113:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 168 |
+
selp.f32 %r168, %r152, %r156, %p33;
|
| 169 |
+
selp.f32 %r169, %r152, %r168, %p34;
|
| 170 |
+
.loc 2 110 15 // triton_helpers.py:110:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 171 |
+
setp.gt.f32 %p35, %r153, %r157;
|
| 172 |
+
.loc 2 112 21 // triton_helpers.py:112:21 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 173 |
+
setp.nan.f32 %p36, %r153, %r153;
|
| 174 |
+
.loc 2 113 29 // triton_helpers.py:113:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 175 |
+
selp.f32 %r170, %r153, %r157, %p35;
|
| 176 |
+
selp.f32 %r171, %r153, %r170, %p36;
|
| 177 |
+
.loc 2 110 15 // triton_helpers.py:110:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 178 |
+
setp.gt.f32 %p37, %r154, %r158;
|
| 179 |
+
.loc 2 112 21 // triton_helpers.py:112:21 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 180 |
+
setp.nan.f32 %p38, %r154, %r154;
|
| 181 |
+
.loc 2 113 29 // triton_helpers.py:113:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 182 |
+
selp.f32 %r172, %r154, %r158, %p37;
|
| 183 |
+
selp.f32 %r173, %r154, %r172, %p38;
|
| 184 |
+
.loc 2 110 15 // triton_helpers.py:110:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 185 |
+
setp.gt.f32 %p39, %r155, %r159;
|
| 186 |
+
.loc 2 112 21 // triton_helpers.py:112:21 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 187 |
+
setp.nan.f32 %p40, %r155, %r155;
|
| 188 |
+
.loc 2 113 29 // triton_helpers.py:113:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 189 |
+
selp.f32 %r174, %r155, %r159, %p39;
|
| 190 |
+
selp.f32 %r175, %r155, %r174, %p40;
|
| 191 |
+
.loc 2 110 15 // triton_helpers.py:110:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 192 |
+
setp.gt.f32 %p41, %r169, %r160;
|
| 193 |
+
.loc 2 112 21 // triton_helpers.py:112:21 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 194 |
+
setp.nan.f32 %p42, %r169, %r169;
|
| 195 |
+
.loc 2 113 29 // triton_helpers.py:113:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 196 |
+
selp.f32 %r176, %r169, %r160, %p42;
|
| 197 |
+
selp.f32 %r177, %r169, %r176, %p41;
|
| 198 |
+
.loc 2 110 15 // triton_helpers.py:110:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 199 |
+
setp.gt.f32 %p43, %r171, %r161;
|
| 200 |
+
.loc 2 112 21 // triton_helpers.py:112:21 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 201 |
+
setp.nan.f32 %p44, %r171, %r171;
|
| 202 |
+
.loc 2 113 29 // triton_helpers.py:113:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 203 |
+
selp.f32 %r178, %r171, %r161, %p44;
|
| 204 |
+
selp.f32 %r179, %r171, %r178, %p43;
|
| 205 |
+
.loc 2 110 15 // triton_helpers.py:110:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 206 |
+
setp.gt.f32 %p45, %r173, %r162;
|
| 207 |
+
.loc 2 112 21 // triton_helpers.py:112:21 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 208 |
+
setp.nan.f32 %p46, %r173, %r173;
|
| 209 |
+
.loc 2 113 29 // triton_helpers.py:113:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 210 |
+
selp.f32 %r180, %r173, %r162, %p46;
|
| 211 |
+
selp.f32 %r181, %r173, %r180, %p45;
|
| 212 |
+
.loc 2 110 15 // triton_helpers.py:110:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 213 |
+
setp.gt.f32 %p47, %r175, %r163;
|
| 214 |
+
.loc 2 112 21 // triton_helpers.py:112:21 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 215 |
+
setp.nan.f32 %p48, %r175, %r175;
|
| 216 |
+
.loc 2 113 29 // triton_helpers.py:113:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 217 |
+
selp.f32 %r182, %r175, %r163, %p48;
|
| 218 |
+
selp.f32 %r183, %r175, %r182, %p47;
|
| 219 |
+
.loc 2 110 15 // triton_helpers.py:110:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 220 |
+
setp.gt.f32 %p49, %r177, %r164;
|
| 221 |
+
.loc 2 112 21 // triton_helpers.py:112:21 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 222 |
+
setp.nan.f32 %p50, %r177, %r177;
|
| 223 |
+
.loc 2 113 29 // triton_helpers.py:113:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 224 |
+
selp.f32 %r184, %r177, %r164, %p50;
|
| 225 |
+
selp.f32 %r92, %r177, %r184, %p49;
|
| 226 |
+
.loc 2 110 15 // triton_helpers.py:110:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 227 |
+
setp.gt.f32 %p51, %r179, %r165;
|
| 228 |
+
.loc 2 112 21 // triton_helpers.py:112:21 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 229 |
+
setp.nan.f32 %p52, %r179, %r179;
|
| 230 |
+
.loc 2 113 29 // triton_helpers.py:113:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 231 |
+
selp.f32 %r185, %r179, %r165, %p52;
|
| 232 |
+
selp.f32 %r94, %r179, %r185, %p51;
|
| 233 |
+
.loc 2 110 15 // triton_helpers.py:110:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 234 |
+
setp.gt.f32 %p53, %r181, %r166;
|
| 235 |
+
.loc 2 112 21 // triton_helpers.py:112:21 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 236 |
+
setp.nan.f32 %p54, %r181, %r181;
|
| 237 |
+
.loc 2 113 29 // triton_helpers.py:113:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 238 |
+
selp.f32 %r186, %r181, %r166, %p54;
|
| 239 |
+
selp.f32 %r96, %r181, %r186, %p53;
|
| 240 |
+
.loc 2 110 15 // triton_helpers.py:110:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 241 |
+
setp.gt.f32 %p55, %r183, %r167;
|
| 242 |
+
.loc 2 112 21 // triton_helpers.py:112:21 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 243 |
+
setp.nan.f32 %p56, %r183, %r183;
|
| 244 |
+
.loc 2 113 29 // triton_helpers.py:113:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 245 |
+
selp.f32 %r187, %r183, %r167, %p56;
|
| 246 |
+
selp.f32 %r98, %r183, %r187, %p55;
|
| 247 |
+
.loc 2 123 29 // triton_helpers.py:123:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 248 |
+
shl.b32 %r188, %r145, 7;
|
| 249 |
+
mov.b32 %r189, global_smem;
|
| 250 |
+
add.s32 %r190, %r189, %r188;
|
| 251 |
+
shl.b32 %r191, %r148, 2;
|
| 252 |
+
add.s32 %r91, %r190, %r191;
|
| 253 |
+
mov.pred %p9, -1;
|
| 254 |
+
// begin inline asm
|
| 255 |
+
@%p9 st.shared.b32 [ %r91 + 0 ], %r92;
|
| 256 |
+
// end inline asm
|
| 257 |
+
add.s32 %r93, %r91, 32;
|
| 258 |
+
// begin inline asm
|
| 259 |
+
@%p9 st.shared.b32 [ %r93 + 0 ], %r94;
|
| 260 |
+
// end inline asm
|
| 261 |
+
add.s32 %r95, %r91, 64;
|
| 262 |
+
// begin inline asm
|
| 263 |
+
@%p9 st.shared.b32 [ %r95 + 0 ], %r96;
|
| 264 |
+
// end inline asm
|
| 265 |
+
add.s32 %r97, %r91, 96;
|
| 266 |
+
// begin inline asm
|
| 267 |
+
@%p9 st.shared.b32 [ %r97 + 0 ], %r98;
|
| 268 |
+
// end inline asm
|
| 269 |
+
bar.sync 0;
|
| 270 |
+
shl.b32 %r192, %r2, 2;
|
| 271 |
+
add.s32 %r100, %r189, %r192;
|
| 272 |
+
// begin inline asm
|
| 273 |
+
@%p9 ld.shared.b32 %r99, [ %r100 + 0 ];
|
| 274 |
+
// end inline asm
|
| 275 |
+
shfl.sync.bfly.b32 %r193, %r99, 4, 31, -1;
|
| 276 |
+
.loc 2 110 15 // triton_helpers.py:110:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 277 |
+
setp.gt.f32 %p57, %r99, %r193;
|
| 278 |
+
.loc 2 112 21 // triton_helpers.py:112:21 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 279 |
+
setp.nan.f32 %p58, %r99, %r99;
|
| 280 |
+
.loc 2 113 29 // triton_helpers.py:113:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 281 |
+
selp.f32 %r194, %r99, %r193, %p57;
|
| 282 |
+
selp.f32 %r195, %r99, %r194, %p58;
|
| 283 |
+
.loc 2 123 29 // triton_helpers.py:123:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 284 |
+
shfl.sync.bfly.b32 %r196, %r195, 2, 31, -1;
|
| 285 |
+
.loc 2 110 15 // triton_helpers.py:110:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 286 |
+
setp.gt.f32 %p59, %r195, %r196;
|
| 287 |
+
.loc 2 112 21 // triton_helpers.py:112:21 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 288 |
+
setp.nan.f32 %p60, %r195, %r195;
|
| 289 |
+
.loc 2 113 29 // triton_helpers.py:113:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 290 |
+
selp.f32 %r197, %r195, %r196, %p60;
|
| 291 |
+
selp.f32 %r198, %r195, %r197, %p59;
|
| 292 |
+
.loc 2 123 29 // triton_helpers.py:123:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 293 |
+
shfl.sync.bfly.b32 %r199, %r198, 1, 31, -1;
|
| 294 |
+
.loc 2 110 15 // triton_helpers.py:110:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 295 |
+
setp.gt.f32 %p61, %r198, %r199;
|
| 296 |
+
.loc 2 112 21 // triton_helpers.py:112:21 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 297 |
+
setp.nan.f32 %p62, %r198, %r198;
|
| 298 |
+
.loc 2 123 29 // triton_helpers.py:123:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 299 |
+
and.b32 %r200, %r2, 7;
|
| 300 |
+
setp.eq.b32 %p14, %r200, 0;
|
| 301 |
+
.loc 2 113 29 // triton_helpers.py:113:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 302 |
+
selp.b32 %r201, %r198, %r199, %p62;
|
| 303 |
+
selp.b32 %r102, %r198, %r201, %p61;
|
| 304 |
+
.loc 2 123 29 // triton_helpers.py:123:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 305 |
+
// begin inline asm
|
| 306 |
+
@%p14 st.shared.b32 [ %r100 + 0 ], %r102;
|
| 307 |
+
// end inline asm
|
| 308 |
+
add.s32 %r104, %r100, 1024;
|
| 309 |
+
// begin inline asm
|
| 310 |
+
@%p9 ld.shared.b32 %r103, [ %r104 + 0 ];
|
| 311 |
+
// end inline asm
|
| 312 |
+
shfl.sync.bfly.b32 %r202, %r103, 4, 31, -1;
|
| 313 |
+
.loc 2 110 15 // triton_helpers.py:110:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 314 |
+
setp.gt.f32 %p63, %r103, %r202;
|
| 315 |
+
.loc 2 112 21 // triton_helpers.py:112:21 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 316 |
+
setp.nan.f32 %p64, %r103, %r103;
|
| 317 |
+
.loc 2 113 29 // triton_helpers.py:113:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 318 |
+
selp.f32 %r203, %r103, %r202, %p63;
|
| 319 |
+
selp.f32 %r204, %r103, %r203, %p64;
|
| 320 |
+
.loc 2 123 29 // triton_helpers.py:123:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 321 |
+
shfl.sync.bfly.b32 %r205, %r204, 2, 31, -1;
|
| 322 |
+
.loc 2 110 15 // triton_helpers.py:110:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 323 |
+
setp.gt.f32 %p65, %r204, %r205;
|
| 324 |
+
.loc 2 112 21 // triton_helpers.py:112:21 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 325 |
+
setp.nan.f32 %p66, %r204, %r204;
|
| 326 |
+
.loc 2 113 29 // triton_helpers.py:113:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 327 |
+
selp.f32 %r206, %r204, %r205, %p66;
|
| 328 |
+
selp.f32 %r207, %r204, %r206, %p65;
|
| 329 |
+
.loc 2 123 29 // triton_helpers.py:123:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 330 |
+
shfl.sync.bfly.b32 %r208, %r207, 1, 31, -1;
|
| 331 |
+
.loc 2 110 15 // triton_helpers.py:110:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 332 |
+
setp.gt.f32 %p67, %r207, %r208;
|
| 333 |
+
.loc 2 112 21 // triton_helpers.py:112:21 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 334 |
+
setp.nan.f32 %p68, %r207, %r207;
|
| 335 |
+
.loc 2 113 29 // triton_helpers.py:113:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 336 |
+
selp.b32 %r209, %r207, %r208, %p68;
|
| 337 |
+
selp.b32 %r106, %r207, %r209, %p67;
|
| 338 |
+
.loc 2 123 29 // triton_helpers.py:123:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 339 |
+
// begin inline asm
|
| 340 |
+
@%p14 st.shared.b32 [ %r104 + 0 ], %r106;
|
| 341 |
+
// end inline asm
|
| 342 |
+
add.s32 %r108, %r100, 2048;
|
| 343 |
+
// begin inline asm
|
| 344 |
+
@%p9 ld.shared.b32 %r107, [ %r108 + 0 ];
|
| 345 |
+
// end inline asm
|
| 346 |
+
shfl.sync.bfly.b32 %r210, %r107, 4, 31, -1;
|
| 347 |
+
.loc 2 110 15 // triton_helpers.py:110:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 348 |
+
setp.gt.f32 %p69, %r107, %r210;
|
| 349 |
+
.loc 2 112 21 // triton_helpers.py:112:21 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 350 |
+
setp.nan.f32 %p70, %r107, %r107;
|
| 351 |
+
.loc 2 113 29 // triton_helpers.py:113:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 352 |
+
selp.f32 %r211, %r107, %r210, %p69;
|
| 353 |
+
selp.f32 %r212, %r107, %r211, %p70;
|
| 354 |
+
.loc 2 123 29 // triton_helpers.py:123:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 355 |
+
shfl.sync.bfly.b32 %r213, %r212, 2, 31, -1;
|
| 356 |
+
.loc 2 110 15 // triton_helpers.py:110:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 357 |
+
setp.gt.f32 %p71, %r212, %r213;
|
| 358 |
+
.loc 2 112 21 // triton_helpers.py:112:21 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 359 |
+
setp.nan.f32 %p72, %r212, %r212;
|
| 360 |
+
.loc 2 113 29 // triton_helpers.py:113:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 361 |
+
selp.f32 %r214, %r212, %r213, %p72;
|
| 362 |
+
selp.f32 %r215, %r212, %r214, %p71;
|
| 363 |
+
.loc 2 123 29 // triton_helpers.py:123:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 364 |
+
shfl.sync.bfly.b32 %r216, %r215, 1, 31, -1;
|
| 365 |
+
.loc 2 110 15 // triton_helpers.py:110:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 366 |
+
setp.gt.f32 %p73, %r215, %r216;
|
| 367 |
+
.loc 2 112 21 // triton_helpers.py:112:21 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 368 |
+
setp.nan.f32 %p74, %r215, %r215;
|
| 369 |
+
.loc 2 113 29 // triton_helpers.py:113:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 370 |
+
selp.b32 %r217, %r215, %r216, %p74;
|
| 371 |
+
selp.b32 %r110, %r215, %r217, %p73;
|
| 372 |
+
.loc 2 123 29 // triton_helpers.py:123:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 373 |
+
// begin inline asm
|
| 374 |
+
@%p14 st.shared.b32 [ %r108 + 0 ], %r110;
|
| 375 |
+
// end inline asm
|
| 376 |
+
add.s32 %r112, %r100, 3072;
|
| 377 |
+
// begin inline asm
|
| 378 |
+
@%p9 ld.shared.b32 %r111, [ %r112 + 0 ];
|
| 379 |
+
// end inline asm
|
| 380 |
+
shfl.sync.bfly.b32 %r218, %r111, 4, 31, -1;
|
| 381 |
+
.loc 2 110 15 // triton_helpers.py:110:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 382 |
+
setp.gt.f32 %p75, %r111, %r218;
|
| 383 |
+
.loc 2 112 21 // triton_helpers.py:112:21 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 384 |
+
setp.nan.f32 %p76, %r111, %r111;
|
| 385 |
+
.loc 2 113 29 // triton_helpers.py:113:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 386 |
+
selp.f32 %r219, %r111, %r218, %p75;
|
| 387 |
+
selp.f32 %r220, %r111, %r219, %p76;
|
| 388 |
+
.loc 2 123 29 // triton_helpers.py:123:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 389 |
+
shfl.sync.bfly.b32 %r221, %r220, 2, 31, -1;
|
| 390 |
+
.loc 2 110 15 // triton_helpers.py:110:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 391 |
+
setp.gt.f32 %p77, %r220, %r221;
|
| 392 |
+
.loc 2 112 21 // triton_helpers.py:112:21 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 393 |
+
setp.nan.f32 %p78, %r220, %r220;
|
| 394 |
+
.loc 2 113 29 // triton_helpers.py:113:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 395 |
+
selp.f32 %r222, %r220, %r221, %p78;
|
| 396 |
+
selp.f32 %r223, %r220, %r222, %p77;
|
| 397 |
+
.loc 2 123 29 // triton_helpers.py:123:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 398 |
+
shfl.sync.bfly.b32 %r224, %r223, 1, 31, -1;
|
| 399 |
+
.loc 2 110 15 // triton_helpers.py:110:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 400 |
+
setp.gt.f32 %p79, %r223, %r224;
|
| 401 |
+
.loc 2 112 21 // triton_helpers.py:112:21 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 402 |
+
setp.nan.f32 %p80, %r223, %r223;
|
| 403 |
+
.loc 2 113 29 // triton_helpers.py:113:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 404 |
+
selp.b32 %r225, %r223, %r224, %p80;
|
| 405 |
+
selp.b32 %r114, %r223, %r225, %p79;
|
| 406 |
+
.loc 2 123 29 // triton_helpers.py:123:29 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:37 ]
|
| 407 |
+
// begin inline asm
|
| 408 |
+
@%p14 st.shared.b32 [ %r112 + 0 ], %r114;
|
| 409 |
+
// end inline asm
|
| 410 |
+
bar.sync 0;
|
| 411 |
+
ld.shared.b32 %r226, [%r190];
|
| 412 |
+
ld.shared.b32 %r227, [%r190+32];
|
| 413 |
+
ld.shared.b32 %r228, [%r190+64];
|
| 414 |
+
ld.shared.b32 %r229, [%r190+96];
|
| 415 |
+
$L__tmp2:
|
| 416 |
+
.loc 1 39 40 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:39:40
|
| 417 |
+
bar.sync 0;
|
| 418 |
+
mad.lo.s32 %r3, %r145, -112, %r190;
|
| 419 |
+
mov.b64 %rd2, {%r228, %r229};
|
| 420 |
+
mov.b64 %rd1, {%r226, %r227};
|
| 421 |
+
st.shared.v2.b64 [%r3], {%rd1, %rd2};
|
| 422 |
+
bar.sync 0;
|
| 423 |
+
shl.b32 %r230, %r144, 7;
|
| 424 |
+
shl.b32 %r231, %r2, 4;
|
| 425 |
+
or.b32 %r232, %r230, %r231;
|
| 426 |
+
and.b32 %r233, %r232, 496;
|
| 427 |
+
add.s32 %r461, %r189, %r233;
|
| 428 |
+
// begin inline asm
|
| 429 |
+
ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%r463}, [%r461];
|
| 430 |
+
// end inline asm
|
| 431 |
+
.loc 1 41 19 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:41:19
|
| 432 |
+
setp.eq.f32 %p81, %r226, 0fFF800000;
|
| 433 |
+
setp.eq.f32 %p82, %r227, 0fFF800000;
|
| 434 |
+
setp.eq.f32 %p83, %r228, 0fFF800000;
|
| 435 |
+
setp.eq.f32 %p84, %r229, 0fFF800000;
|
| 436 |
+
.loc 1 42 18 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:42:18
|
| 437 |
+
sub.f32 %r234, %r27, %r226;
|
| 438 |
+
sub.f32 %r235, %r28, %r227;
|
| 439 |
+
sub.f32 %r236, %r29, %r228;
|
| 440 |
+
sub.f32 %r237, %r30, %r229;
|
| 441 |
+
sub.f32 %r238, %r35, %r226;
|
| 442 |
+
sub.f32 %r239, %r36, %r227;
|
| 443 |
+
sub.f32 %r240, %r37, %r228;
|
| 444 |
+
sub.f32 %r241, %r38, %r229;
|
| 445 |
+
sub.f32 %r242, %r43, %r226;
|
| 446 |
+
sub.f32 %r243, %r44, %r227;
|
| 447 |
+
sub.f32 %r244, %r45, %r228;
|
| 448 |
+
sub.f32 %r245, %r46, %r229;
|
| 449 |
+
sub.f32 %r246, %r51, %r226;
|
| 450 |
+
sub.f32 %r247, %r52, %r227;
|
| 451 |
+
sub.f32 %r248, %r53, %r228;
|
| 452 |
+
sub.f32 %r249, %r54, %r229;
|
| 453 |
+
.loc 1 44 33 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:44:33
|
| 454 |
+
selp.f32 %r250, 0f00000000, %r234, %p81;
|
| 455 |
+
selp.f32 %r251, 0f00000000, %r235, %p82;
|
| 456 |
+
selp.f32 %r252, 0f00000000, %r236, %p83;
|
| 457 |
+
selp.f32 %r253, 0f00000000, %r237, %p84;
|
| 458 |
+
selp.f32 %r254, 0f00000000, %r238, %p81;
|
| 459 |
+
selp.f32 %r255, 0f00000000, %r239, %p82;
|
| 460 |
+
selp.f32 %r256, 0f00000000, %r240, %p83;
|
| 461 |
+
selp.f32 %r257, 0f00000000, %r241, %p84;
|
| 462 |
+
selp.f32 %r258, 0f00000000, %r242, %p81;
|
| 463 |
+
selp.f32 %r259, 0f00000000, %r243, %p82;
|
| 464 |
+
selp.f32 %r260, 0f00000000, %r244, %p83;
|
| 465 |
+
selp.f32 %r261, 0f00000000, %r245, %p84;
|
| 466 |
+
selp.f32 %r262, 0f00000000, %r246, %p81;
|
| 467 |
+
selp.f32 %r263, 0f00000000, %r247, %p82;
|
| 468 |
+
selp.f32 %r264, 0f00000000, %r248, %p83;
|
| 469 |
+
selp.f32 %r265, 0f00000000, %r249, %p84;
|
| 470 |
+
.loc 1 45 27 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:45:27
|
| 471 |
+
ex2.approx.ftz.f32 %r266, %r250;
|
| 472 |
+
ex2.approx.ftz.f32 %r267, %r251;
|
| 473 |
+
ex2.approx.ftz.f32 %r268, %r252;
|
| 474 |
+
ex2.approx.ftz.f32 %r269, %r253;
|
| 475 |
+
ex2.approx.ftz.f32 %r270, %r254;
|
| 476 |
+
ex2.approx.ftz.f32 %r271, %r255;
|
| 477 |
+
ex2.approx.ftz.f32 %r272, %r256;
|
| 478 |
+
ex2.approx.ftz.f32 %r273, %r257;
|
| 479 |
+
ex2.approx.ftz.f32 %r274, %r258;
|
| 480 |
+
ex2.approx.ftz.f32 %r275, %r259;
|
| 481 |
+
ex2.approx.ftz.f32 %r276, %r260;
|
| 482 |
+
ex2.approx.ftz.f32 %r277, %r261;
|
| 483 |
+
ex2.approx.ftz.f32 %r278, %r262;
|
| 484 |
+
ex2.approx.ftz.f32 %r279, %r263;
|
| 485 |
+
ex2.approx.ftz.f32 %r280, %r264;
|
| 486 |
+
ex2.approx.ftz.f32 %r281, %r265;
|
| 487 |
+
.loc 1 46 19 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:46:19
|
| 488 |
+
mul.f32 %r282, %r270, %r67;
|
| 489 |
+
mul.f32 %r283, %r271, %r68;
|
| 490 |
+
mul.f32 %r284, %r272, %r69;
|
| 491 |
+
mul.f32 %r285, %r273, %r70;
|
| 492 |
+
$L__tmp3:
|
| 493 |
+
.loc 3 291 36 // standard.py:291:36 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 494 |
+
bar.sync 0;
|
| 495 |
+
.loc 3 261 15 // standard.py:261:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 496 |
+
fma.rn.f32 %r286, %r266, %r59, %r282;
|
| 497 |
+
fma.rn.f32 %r287, %r267, %r60, %r283;
|
| 498 |
+
fma.rn.f32 %r288, %r268, %r61, %r284;
|
| 499 |
+
fma.rn.f32 %r289, %r269, %r62, %r285;
|
| 500 |
+
fma.rn.f32 %r290, %r274, %r75, %r286;
|
| 501 |
+
fma.rn.f32 %r291, %r275, %r76, %r287;
|
| 502 |
+
fma.rn.f32 %r292, %r276, %r77, %r288;
|
| 503 |
+
fma.rn.f32 %r293, %r277, %r78, %r289;
|
| 504 |
+
fma.rn.f32 %r294, %r278, %r83, %r290;
|
| 505 |
+
fma.rn.f32 %r295, %r279, %r84, %r291;
|
| 506 |
+
fma.rn.f32 %r296, %r280, %r85, %r292;
|
| 507 |
+
fma.rn.f32 %r297, %r281, %r86, %r293;
|
| 508 |
+
selp.b32 %r118, %r294, 0, %p1;
|
| 509 |
+
.loc 3 291 36 // standard.py:291:36 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 510 |
+
// begin inline asm
|
| 511 |
+
@%p9 st.shared.b32 [ %r91 + 0 ], %r118;
|
| 512 |
+
// end inline asm
|
| 513 |
+
.loc 3 261 15 // standard.py:261:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 514 |
+
selp.b32 %r120, %r295, 0, %p1;
|
| 515 |
+
.loc 3 291 36 // standard.py:291:36 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 516 |
+
// begin inline asm
|
| 517 |
+
@%p9 st.shared.b32 [ %r93 + 0 ], %r120;
|
| 518 |
+
// end inline asm
|
| 519 |
+
.loc 3 261 15 // standard.py:261:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 520 |
+
selp.b32 %r122, %r296, 0, %p1;
|
| 521 |
+
.loc 3 291 36 // standard.py:291:36 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 522 |
+
// begin inline asm
|
| 523 |
+
@%p9 st.shared.b32 [ %r95 + 0 ], %r122;
|
| 524 |
+
// end inline asm
|
| 525 |
+
.loc 3 261 15 // standard.py:261:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 526 |
+
selp.b32 %r124, %r297, 0, %p1;
|
| 527 |
+
.loc 3 291 36 // standard.py:291:36 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 528 |
+
// begin inline asm
|
| 529 |
+
@%p9 st.shared.b32 [ %r97 + 0 ], %r124;
|
| 530 |
+
// end inline asm
|
| 531 |
+
bar.sync 0;
|
| 532 |
+
// begin inline asm
|
| 533 |
+
@%p9 ld.shared.b32 %r125, [ %r100 + 0 ];
|
| 534 |
+
// end inline asm
|
| 535 |
+
shfl.sync.bfly.b32 %r298, %r125, 4, 31, -1;
|
| 536 |
+
.loc 3 261 15 // standard.py:261:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 537 |
+
add.f32 %r299, %r125, %r298;
|
| 538 |
+
.loc 3 291 36 // standard.py:291:36 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 539 |
+
shfl.sync.bfly.b32 %r300, %r299, 2, 31, -1;
|
| 540 |
+
.loc 3 261 15 // standard.py:261:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 541 |
+
add.f32 %r301, %r299, %r300;
|
| 542 |
+
.loc 3 291 36 // standard.py:291:36 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 543 |
+
shfl.sync.bfly.b32 %r302, %r301, 1, 31, -1;
|
| 544 |
+
.loc 3 261 15 // standard.py:261:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 545 |
+
add.f32 %r128, %r301, %r302;
|
| 546 |
+
.loc 3 291 36 // standard.py:291:36 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 547 |
+
// begin inline asm
|
| 548 |
+
@%p14 st.shared.b32 [ %r100 + 0 ], %r128;
|
| 549 |
+
// end inline asm
|
| 550 |
+
// begin inline asm
|
| 551 |
+
@%p9 ld.shared.b32 %r129, [ %r104 + 0 ];
|
| 552 |
+
// end inline asm
|
| 553 |
+
shfl.sync.bfly.b32 %r303, %r129, 4, 31, -1;
|
| 554 |
+
.loc 3 261 15 // standard.py:261:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 555 |
+
add.f32 %r304, %r129, %r303;
|
| 556 |
+
.loc 3 291 36 // standard.py:291:36 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 557 |
+
shfl.sync.bfly.b32 %r305, %r304, 2, 31, -1;
|
| 558 |
+
.loc 3 261 15 // standard.py:261:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 559 |
+
add.f32 %r306, %r304, %r305;
|
| 560 |
+
.loc 3 291 36 // standard.py:291:36 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 561 |
+
shfl.sync.bfly.b32 %r307, %r306, 1, 31, -1;
|
| 562 |
+
.loc 3 261 15 // standard.py:261:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 563 |
+
add.f32 %r132, %r306, %r307;
|
| 564 |
+
.loc 3 291 36 // standard.py:291:36 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 565 |
+
// begin inline asm
|
| 566 |
+
@%p14 st.shared.b32 [ %r104 + 0 ], %r132;
|
| 567 |
+
// end inline asm
|
| 568 |
+
// begin inline asm
|
| 569 |
+
@%p9 ld.shared.b32 %r133, [ %r108 + 0 ];
|
| 570 |
+
// end inline asm
|
| 571 |
+
shfl.sync.bfly.b32 %r308, %r133, 4, 31, -1;
|
| 572 |
+
.loc 3 261 15 // standard.py:261:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 573 |
+
add.f32 %r309, %r133, %r308;
|
| 574 |
+
.loc 3 291 36 // standard.py:291:36 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 575 |
+
shfl.sync.bfly.b32 %r310, %r309, 2, 31, -1;
|
| 576 |
+
.loc 3 261 15 // standard.py:261:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 577 |
+
add.f32 %r311, %r309, %r310;
|
| 578 |
+
.loc 3 291 36 // standard.py:291:36 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 579 |
+
shfl.sync.bfly.b32 %r312, %r311, 1, 31, -1;
|
| 580 |
+
.loc 3 261 15 // standard.py:261:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 581 |
+
add.f32 %r136, %r311, %r312;
|
| 582 |
+
.loc 3 291 36 // standard.py:291:36 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 583 |
+
// begin inline asm
|
| 584 |
+
@%p14 st.shared.b32 [ %r108 + 0 ], %r136;
|
| 585 |
+
// end inline asm
|
| 586 |
+
// begin inline asm
|
| 587 |
+
@%p9 ld.shared.b32 %r137, [ %r112 + 0 ];
|
| 588 |
+
// end inline asm
|
| 589 |
+
shfl.sync.bfly.b32 %r313, %r137, 4, 31, -1;
|
| 590 |
+
.loc 3 261 15 // standard.py:261:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 591 |
+
add.f32 %r314, %r137, %r313;
|
| 592 |
+
.loc 3 291 36 // standard.py:291:36 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 593 |
+
shfl.sync.bfly.b32 %r315, %r314, 2, 31, -1;
|
| 594 |
+
.loc 3 261 15 // standard.py:261:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 595 |
+
add.f32 %r316, %r314, %r315;
|
| 596 |
+
.loc 3 291 36 // standard.py:291:36 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 597 |
+
shfl.sync.bfly.b32 %r317, %r316, 1, 31, -1;
|
| 598 |
+
.loc 3 261 15 // standard.py:261:15 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 599 |
+
add.f32 %r140, %r316, %r317;
|
| 600 |
+
.loc 3 291 36 // standard.py:291:36 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:26 ]
|
| 601 |
+
// begin inline asm
|
| 602 |
+
@%p14 st.shared.b32 [ %r112 + 0 ], %r140;
|
| 603 |
+
// end inline asm
|
| 604 |
+
bar.sync 0;
|
| 605 |
+
ld.shared.b32 %r318, [%r190];
|
| 606 |
+
ld.shared.b32 %r319, [%r190+32];
|
| 607 |
+
ld.shared.b32 %r320, [%r190+64];
|
| 608 |
+
ld.shared.b32 %r321, [%r190+96];
|
| 609 |
+
$L__tmp4:
|
| 610 |
+
.loc 1 49 29 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:49:29
|
| 611 |
+
bar.sync 0;
|
| 612 |
+
st.shared.v4.b32 [%r3], {%r318, %r319, %r320, %r321};
|
| 613 |
+
bar.sync 0;
|
| 614 |
+
// begin inline asm
|
| 615 |
+
ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%r464}, [%r461];
|
| 616 |
+
// end inline asm
|
| 617 |
+
.loc 1 51 34 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:51:34
|
| 618 |
+
selp.f32 %r322, 0f3F800000, %r318, %p81;
|
| 619 |
+
selp.f32 %r7, 0f3F800000, %r319, %p82;
|
| 620 |
+
.loc 1 52 27 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:52:27
|
| 621 |
+
setp.lt.f32 %p85, %r322, 0f00800000;
|
| 622 |
+
mul.f32 %r323, %r322, 0f4B000000;
|
| 623 |
+
selp.f32 %r10, %r323, %r322, %p85;
|
| 624 |
+
selp.f32 %r324, 0fC1B80000, 0f00000000, %p85;
|
| 625 |
+
add.s32 %r325, %r10, -1060439283;
|
| 626 |
+
and.b32 %r326, %r325, -8388608;
|
| 627 |
+
sub.s32 %r327, %r10, %r326;
|
| 628 |
+
cvt.rn.f32.s32 %r328, %r326;
|
| 629 |
+
mov.b32 %r329, 0f34000000;
|
| 630 |
+
fma.rn.ftz.f32 %r330, %r328, %r329, %r324;
|
| 631 |
+
add.f32 %r331, %r327, 0fBF800000;
|
| 632 |
+
mov.b32 %r332, 0fBE2C7F30;
|
| 633 |
+
mov.b32 %r333, 0f3DC6B27F;
|
| 634 |
+
fma.rn.ftz.f32 %r334, %r333, %r331, %r332;
|
| 635 |
+
mov.b32 %r335, 0f3E2FCF2A;
|
| 636 |
+
fma.rn.ftz.f32 %r336, %r334, %r331, %r335;
|
| 637 |
+
mov.b32 %r337, 0fBE374E43;
|
| 638 |
+
fma.rn.ftz.f32 %r338, %r336, %r331, %r337;
|
| 639 |
+
mov.b32 %r339, 0f3E520BF4;
|
| 640 |
+
fma.rn.ftz.f32 %r340, %r338, %r331, %r339;
|
| 641 |
+
mov.b32 %r341, 0fBE763C8B;
|
| 642 |
+
fma.rn.ftz.f32 %r342, %r340, %r331, %r341;
|
| 643 |
+
mov.b32 %r343, 0f3E93BF99;
|
| 644 |
+
fma.rn.ftz.f32 %r344, %r342, %r331, %r343;
|
| 645 |
+
mov.b32 %r345, 0fBEB8AA49;
|
| 646 |
+
fma.rn.ftz.f32 %r346, %r344, %r331, %r345;
|
| 647 |
+
mov.b32 %r347, 0f3EF6384A;
|
| 648 |
+
fma.rn.ftz.f32 %r348, %r346, %r331, %r347;
|
| 649 |
+
mov.b32 %r349, 0fBF38AA3B;
|
| 650 |
+
fma.rn.ftz.f32 %r350, %r348, %r331, %r349;
|
| 651 |
+
mul.f32 %r351, %r331, %r350;
|
| 652 |
+
mul.f32 %r352, %r331, %r351;
|
| 653 |
+
mov.b32 %r353, 0f3FB8AA3B;
|
| 654 |
+
fma.rn.ftz.f32 %r354, %r331, %r353, %r352;
|
| 655 |
+
add.f32 %r487, %r330, %r354;
|
| 656 |
+
setp.lt.u32 %p86, %r10, 2139095040;
|
| 657 |
+
mov.b32 %r355, 0f7F800000;
|
| 658 |
+
@%p86 bra $L__BB0_2;
|
| 659 |
+
// %bb.1: // %__nv_fmaf_rn.exit.i.i
|
| 660 |
+
.loc 1 0 27 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:0:27
|
| 661 |
+
fma.rn.ftz.f32 %r487, %r10, %r355, %r355;
|
| 662 |
+
$L__BB0_2: // %__nv_log2f.exit
|
| 663 |
+
selp.f32 %r8, 0f3F800000, %r320, %p83;
|
| 664 |
+
.loc 1 52 27 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:52:27
|
| 665 |
+
setp.lt.f32 %p87, %r7, 0f00800000;
|
| 666 |
+
mul.f32 %r356, %r7, 0f4B000000;
|
| 667 |
+
selp.f32 %r14, %r356, %r7, %p87;
|
| 668 |
+
selp.f32 %r357, 0fC1B80000, 0f00000000, %p87;
|
| 669 |
+
add.s32 %r358, %r14, -1060439283;
|
| 670 |
+
and.b32 %r359, %r358, -8388608;
|
| 671 |
+
sub.s32 %r360, %r14, %r359;
|
| 672 |
+
cvt.rn.f32.s32 %r361, %r359;
|
| 673 |
+
fma.rn.ftz.f32 %r363, %r361, %r329, %r357;
|
| 674 |
+
add.f32 %r364, %r360, 0fBF800000;
|
| 675 |
+
fma.rn.ftz.f32 %r367, %r333, %r364, %r332;
|
| 676 |
+
fma.rn.ftz.f32 %r369, %r367, %r364, %r335;
|
| 677 |
+
fma.rn.ftz.f32 %r371, %r369, %r364, %r337;
|
| 678 |
+
fma.rn.ftz.f32 %r373, %r371, %r364, %r339;
|
| 679 |
+
fma.rn.ftz.f32 %r375, %r373, %r364, %r341;
|
| 680 |
+
fma.rn.ftz.f32 %r377, %r375, %r364, %r343;
|
| 681 |
+
fma.rn.ftz.f32 %r379, %r377, %r364, %r345;
|
| 682 |
+
fma.rn.ftz.f32 %r381, %r379, %r364, %r347;
|
| 683 |
+
fma.rn.ftz.f32 %r383, %r381, %r364, %r349;
|
| 684 |
+
mul.f32 %r384, %r364, %r383;
|
| 685 |
+
mul.f32 %r385, %r364, %r384;
|
| 686 |
+
fma.rn.ftz.f32 %r387, %r364, %r353, %r385;
|
| 687 |
+
add.f32 %r488, %r363, %r387;
|
| 688 |
+
setp.lt.u32 %p88, %r14, 2139095040;
|
| 689 |
+
@%p88 bra $L__BB0_4;
|
| 690 |
+
// %bb.3: // %__nv_fmaf_rn.exit.i.i74
|
| 691 |
+
.loc 1 0 27 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:0:27
|
| 692 |
+
fma.rn.ftz.f32 %r488, %r14, %r355, %r355;
|
| 693 |
+
$L__BB0_4: // %__nv_log2f.exit77
|
| 694 |
+
selp.f32 %r9, 0f3F800000, %r321, %p84;
|
| 695 |
+
.loc 1 52 27 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:52:27
|
| 696 |
+
setp.lt.f32 %p89, %r8, 0f00800000;
|
| 697 |
+
mul.f32 %r389, %r8, 0f4B000000;
|
| 698 |
+
selp.f32 %r18, %r389, %r8, %p89;
|
| 699 |
+
selp.f32 %r390, 0fC1B80000, 0f00000000, %p89;
|
| 700 |
+
add.s32 %r391, %r18, -1060439283;
|
| 701 |
+
and.b32 %r392, %r391, -8388608;
|
| 702 |
+
sub.s32 %r393, %r18, %r392;
|
| 703 |
+
cvt.rn.f32.s32 %r394, %r392;
|
| 704 |
+
mov.b32 %r395, 0f34000000;
|
| 705 |
+
fma.rn.ftz.f32 %r396, %r394, %r395, %r390;
|
| 706 |
+
add.f32 %r397, %r393, 0fBF800000;
|
| 707 |
+
mov.b32 %r398, 0fBE2C7F30;
|
| 708 |
+
mov.b32 %r399, 0f3DC6B27F;
|
| 709 |
+
fma.rn.ftz.f32 %r400, %r399, %r397, %r398;
|
| 710 |
+
mov.b32 %r401, 0f3E2FCF2A;
|
| 711 |
+
fma.rn.ftz.f32 %r402, %r400, %r397, %r401;
|
| 712 |
+
mov.b32 %r403, 0fBE374E43;
|
| 713 |
+
fma.rn.ftz.f32 %r404, %r402, %r397, %r403;
|
| 714 |
+
mov.b32 %r405, 0f3E520BF4;
|
| 715 |
+
fma.rn.ftz.f32 %r406, %r404, %r397, %r405;
|
| 716 |
+
mov.b32 %r407, 0fBE763C8B;
|
| 717 |
+
fma.rn.ftz.f32 %r408, %r406, %r397, %r407;
|
| 718 |
+
mov.b32 %r409, 0f3E93BF99;
|
| 719 |
+
fma.rn.ftz.f32 %r410, %r408, %r397, %r409;
|
| 720 |
+
mov.b32 %r411, 0fBEB8AA49;
|
| 721 |
+
fma.rn.ftz.f32 %r412, %r410, %r397, %r411;
|
| 722 |
+
mov.b32 %r413, 0f3EF6384A;
|
| 723 |
+
fma.rn.ftz.f32 %r414, %r412, %r397, %r413;
|
| 724 |
+
mov.b32 %r415, 0fBF38AA3B;
|
| 725 |
+
fma.rn.ftz.f32 %r416, %r414, %r397, %r415;
|
| 726 |
+
mul.f32 %r417, %r397, %r416;
|
| 727 |
+
mul.f32 %r418, %r397, %r417;
|
| 728 |
+
mov.b32 %r419, 0f3FB8AA3B;
|
| 729 |
+
fma.rn.ftz.f32 %r420, %r397, %r419, %r418;
|
| 730 |
+
add.f32 %r489, %r396, %r420;
|
| 731 |
+
setp.lt.u32 %p90, %r18, 2139095040;
|
| 732 |
+
mov.b32 %r421, 0f7F800000;
|
| 733 |
+
@%p90 bra $L__BB0_6;
|
| 734 |
+
// %bb.5: // %__nv_fmaf_rn.exit.i.i104
|
| 735 |
+
.loc 1 0 27 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:0:27
|
| 736 |
+
fma.rn.ftz.f32 %r489, %r18, %r421, %r421;
|
| 737 |
+
$L__BB0_6: // %__nv_log2f.exit107
|
| 738 |
+
.loc 1 52 27 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:52:27
|
| 739 |
+
setp.lt.f32 %p91, %r9, 0f00800000;
|
| 740 |
+
mul.f32 %r422, %r9, 0f4B000000;
|
| 741 |
+
selp.f32 %r22, %r422, %r9, %p91;
|
| 742 |
+
selp.f32 %r423, 0fC1B80000, 0f00000000, %p91;
|
| 743 |
+
add.s32 %r424, %r22, -1060439283;
|
| 744 |
+
and.b32 %r425, %r424, -8388608;
|
| 745 |
+
sub.s32 %r426, %r22, %r425;
|
| 746 |
+
cvt.rn.f32.s32 %r427, %r425;
|
| 747 |
+
fma.rn.ftz.f32 %r429, %r427, %r395, %r423;
|
| 748 |
+
add.f32 %r430, %r426, 0fBF800000;
|
| 749 |
+
fma.rn.ftz.f32 %r433, %r399, %r430, %r398;
|
| 750 |
+
fma.rn.ftz.f32 %r435, %r433, %r430, %r401;
|
| 751 |
+
fma.rn.ftz.f32 %r437, %r435, %r430, %r403;
|
| 752 |
+
fma.rn.ftz.f32 %r439, %r437, %r430, %r405;
|
| 753 |
+
fma.rn.ftz.f32 %r441, %r439, %r430, %r407;
|
| 754 |
+
fma.rn.ftz.f32 %r443, %r441, %r430, %r409;
|
| 755 |
+
fma.rn.ftz.f32 %r445, %r443, %r430, %r411;
|
| 756 |
+
fma.rn.ftz.f32 %r447, %r445, %r430, %r413;
|
| 757 |
+
fma.rn.ftz.f32 %r449, %r447, %r430, %r415;
|
| 758 |
+
mul.f32 %r450, %r430, %r449;
|
| 759 |
+
mul.f32 %r451, %r430, %r450;
|
| 760 |
+
fma.rn.ftz.f32 %r453, %r430, %r419, %r451;
|
| 761 |
+
add.f32 %r490, %r429, %r453;
|
| 762 |
+
setp.lt.u32 %p92, %r22, 2139095040;
|
| 763 |
+
@%p92 bra $L__BB0_8;
|
| 764 |
+
// %bb.7: // %__nv_fmaf_rn.exit.i.i134
|
| 765 |
+
.loc 1 0 27 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:0:27
|
| 766 |
+
fma.rn.ftz.f32 %r490, %r22, %r421, %r421;
|
| 767 |
+
$L__BB0_8: // %__nv_log2f.exit137
|
| 768 |
+
ld.param.b64 %rd12, [triton_per_fused_mul_1_param_4];
|
| 769 |
+
ld.param.b64 %rd11, [triton_per_fused_mul_1_param_3];
|
| 770 |
+
ld.param.b64 %rd10, [triton_per_fused_mul_1_param_2];
|
| 771 |
+
.loc 1 52 27 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:52:27
|
| 772 |
+
mov.b64 %rd4, {%r18, %r22};
|
| 773 |
+
mov.b64 %rd3, {%r10, %r14};
|
| 774 |
+
.loc 1 24 44 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:24:44
|
| 775 |
+
and.b32 %r455, %r2, 127;
|
| 776 |
+
.loc 1 24 23 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:24:23
|
| 777 |
+
or.b32 %r456, %r1, %r455;
|
| 778 |
+
.loc 1 33 19 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:33:19
|
| 779 |
+
cvt.s64.s32 %rd5, %r456;
|
| 780 |
+
$L__tmp5:
|
| 781 |
+
.loc 2 72 16 // triton_helpers.py:72:16 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:34:51 ]
|
| 782 |
+
or.b64 %rd38, %rd5, %rd13;
|
| 783 |
+
and.b64 %rd39, %rd38, -4294967296;
|
| 784 |
+
setp.ne.b64 %p93, %rd39, 0;
|
| 785 |
+
@%p93 bra $L__BB0_10;
|
| 786 |
+
bra.uni $L__BB0_9;
|
| 787 |
+
$L__BB0_10:
|
| 788 |
+
div.s64 %rd56, %rd5, %rd13;
|
| 789 |
+
bra.uni $L__BB0_11;
|
| 790 |
+
$L__BB0_9:
|
| 791 |
+
cvt.u32.u64 %r457, %rd13;
|
| 792 |
+
cvt.u32.u64 %r458, %rd5;
|
| 793 |
+
div.u32 %r459, %r458, %r457;
|
| 794 |
+
cvt.u64.u32 %rd56, %r459;
|
| 795 |
+
$L__tmp6:
|
| 796 |
+
$L__BB0_11:
|
| 797 |
+
.loc 2 0 16 // triton_helpers.py:0:16
|
| 798 |
+
cvt.u32.u64 %r465, %rd5;
|
| 799 |
+
.loc 1 52 27 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:52:27
|
| 800 |
+
mov.b64 {%r466, %r467}, %rd4;
|
| 801 |
+
setp.eq.f32 %p97, %r466, 0f00000000;
|
| 802 |
+
setp.eq.f32 %p98, %r467, 0f00000000;
|
| 803 |
+
mov.b64 {%r468, %r469}, %rd3;
|
| 804 |
+
setp.eq.f32 %p99, %r468, 0f00000000;
|
| 805 |
+
setp.eq.f32 %p100, %r469, 0f00000000;
|
| 806 |
+
$L__tmp7:
|
| 807 |
+
.loc 2 75 25 // triton_helpers.py:75:25 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:34:51 ]
|
| 808 |
+
setp.lt.s32 %p101, %r1, 0;
|
| 809 |
+
.loc 2 75 36 // triton_helpers.py:75:36 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:34:51 ]
|
| 810 |
+
setp.lt.s64 %p102, %rd13, 0;
|
| 811 |
+
.loc 2 75 32 // triton_helpers.py:75:32 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:34:51 ]
|
| 812 |
+
xor.pred %p103, %p101, %p102;
|
| 813 |
+
$L__tmp8:
|
| 814 |
+
.loc 1 33 19 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:33:19
|
| 815 |
+
mul.lo.s64 %rd44, %rd56, %rd13;
|
| 816 |
+
sub.s64 %rd45, %rd5, %rd44;
|
| 817 |
+
$L__tmp9:
|
| 818 |
+
.loc 2 74 34 // triton_helpers.py:74:34 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:34:51 ]
|
| 819 |
+
setp.ne.b64 %p104, %rd45, 0;
|
| 820 |
+
.loc 2 75 47 // triton_helpers.py:75:47 @[ c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:34:51 ]
|
| 821 |
+
and.pred %p105, %p103, %p104;
|
| 822 |
+
selp.b64 %rd46, -1, 0, %p105;
|
| 823 |
+
add.s64 %rd47, %rd56, %rd46;
|
| 824 |
+
$L__tmp10:
|
| 825 |
+
.loc 1 25 21 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:25:21
|
| 826 |
+
setp.lt.s32 %p106, %r465, %r26;
|
| 827 |
+
.loc 1 52 27 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:52:27
|
| 828 |
+
selp.f32 %r470, 0fFF800000, %r488, %p100;
|
| 829 |
+
selp.f32 %r471, 0fFF800000, %r487, %p99;
|
| 830 |
+
selp.f32 %r472, 0fFF800000, %r490, %p98;
|
| 831 |
+
selp.f32 %r473, 0fFF800000, %r489, %p97;
|
| 832 |
+
.loc 1 53 20 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:53:20
|
| 833 |
+
mov.b64 {%r474, %r475}, %rd2;
|
| 834 |
+
add.f32 %r476, %r474, %r473;
|
| 835 |
+
add.f32 %r477, %r475, %r472;
|
| 836 |
+
mov.b64 {%r478, %r479}, %rd1;
|
| 837 |
+
add.f32 %r480, %r478, %r471;
|
| 838 |
+
add.f32 %r481, %r479, %r470;
|
| 839 |
+
.loc 1 55 20 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:55:20
|
| 840 |
+
mul.f32 %r482, %r481, 0f3F317218;
|
| 841 |
+
mul.f32 %r483, %r480, 0f3F317218;
|
| 842 |
+
mul.f32 %r484, %r477, 0f3F317218;
|
| 843 |
+
mul.f32 %r485, %r476, 0f3F317218;
|
| 844 |
+
.loc 1 56 49 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:56:49
|
| 845 |
+
setp.lt.s64 %p107, %rd13, 2;
|
| 846 |
+
.loc 1 56 75 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:56:75
|
| 847 |
+
setp.gt.s64 %p108, %rd13, 1;
|
| 848 |
+
.loc 1 56 66 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:56:66
|
| 849 |
+
selp.b64 %rd48, %rd13, 0, %p108;
|
| 850 |
+
.loc 1 56 0 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:56
|
| 851 |
+
selp.b64 %rd49, 1, 0, %p107;
|
| 852 |
+
.loc 1 56 57 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:56:57
|
| 853 |
+
add.s64 %rd50, %rd48, %rd49;
|
| 854 |
+
.loc 1 56 34 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:56:34
|
| 855 |
+
mul.lo.s64 %rd51, %rd47, %rd50;
|
| 856 |
+
.loc 1 56 25 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:56:25
|
| 857 |
+
shl.b64 %rd52, %rd45, 2;
|
| 858 |
+
add.s64 %rd53, %rd12, %rd52;
|
| 859 |
+
shl.b64 %rd54, %rd51, 2;
|
| 860 |
+
add.s64 %rd40, %rd53, %rd54;
|
| 861 |
+
.loc 1 56 89 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:56:89
|
| 862 |
+
bar.sync 0;
|
| 863 |
+
st.shared.v4.b32 [%r3], {%r483, %r482, %r485, %r484};
|
| 864 |
+
bar.sync 0;
|
| 865 |
+
// begin inline asm
|
| 866 |
+
ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%r462}, [%r461];
|
| 867 |
+
// end inline asm
|
| 868 |
+
and.b32 %r486, %r2, 128;
|
| 869 |
+
setp.eq.b32 %p109, %r486, 0;
|
| 870 |
+
and.pred %p94, %p109, %p106;
|
| 871 |
+
// begin inline asm
|
| 872 |
+
@%p94 st.global.b32 [ %rd40 + 0 ], { %r462 };
|
| 873 |
+
// end inline asm
|
| 874 |
+
.loc 1 57 25 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:57:25
|
| 875 |
+
shl.b64 %rd55, %rd5, 2;
|
| 876 |
+
add.s64 %rd41, %rd10, %rd55;
|
| 877 |
+
.loc 1 57 36 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:57:36
|
| 878 |
+
// begin inline asm
|
| 879 |
+
@%p94 st.global.b32 [ %rd41 + 0 ], { %r463 };
|
| 880 |
+
// end inline asm
|
| 881 |
+
.loc 1 58 25 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:58:25
|
| 882 |
+
add.s64 %rd42, %rd11, %rd55;
|
| 883 |
+
.loc 1 58 37 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:58:37
|
| 884 |
+
// begin inline asm
|
| 885 |
+
@%p94 st.global.b32 [ %rd42 + 0 ], { %r464 };
|
| 886 |
+
// end inline asm
|
| 887 |
+
.loc 1 58 4 // c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py:58:4
|
| 888 |
+
ret;
|
| 889 |
+
$L__tmp11:
|
| 890 |
+
$L__func_end0:
|
| 891 |
+
// -- End function
|
| 892 |
+
}
|
| 893 |
+
.file 1 "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py"
|
| 894 |
+
.file 2 "/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py"
|
| 895 |
+
.file 3 "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py"
|
| 896 |
+
.section .debug_abbrev
|
| 897 |
+
{
|
| 898 |
+
.b8 1 // Abbreviation Code
|
| 899 |
+
.b8 17 // DW_TAG_compile_unit
|
| 900 |
+
.b8 1 // DW_CHILDREN_yes
|
| 901 |
+
.b8 37 // DW_AT_producer
|
| 902 |
+
.b8 8 // DW_FORM_string
|
| 903 |
+
.b8 19 // DW_AT_language
|
| 904 |
+
.b8 5 // DW_FORM_data2
|
| 905 |
+
.b8 3 // DW_AT_name
|
| 906 |
+
.b8 8 // DW_FORM_string
|
| 907 |
+
.b8 16 // DW_AT_stmt_list
|
| 908 |
+
.b8 6 // DW_FORM_data4
|
| 909 |
+
.b8 27 // DW_AT_comp_dir
|
| 910 |
+
.b8 8 // DW_FORM_string
|
| 911 |
+
.b8 0 // EOM(1)
|
| 912 |
+
.b8 0 // EOM(2)
|
| 913 |
+
.b8 2 // Abbreviation Code
|
| 914 |
+
.b8 46 // DW_TAG_subprogram
|
| 915 |
+
.b8 0 // DW_CHILDREN_no
|
| 916 |
+
.b8 3 // DW_AT_name
|
| 917 |
+
.b8 8 // DW_FORM_string
|
| 918 |
+
.b8 32 // DW_AT_inline
|
| 919 |
+
.b8 11 // DW_FORM_data1
|
| 920 |
+
.b8 0 // EOM(1)
|
| 921 |
+
.b8 0 // EOM(2)
|
| 922 |
+
.b8 3 // Abbreviation Code
|
| 923 |
+
.b8 46 // DW_TAG_subprogram
|
| 924 |
+
.b8 1 // DW_CHILDREN_yes
|
| 925 |
+
.b8 17 // DW_AT_low_pc
|
| 926 |
+
.b8 1 // DW_FORM_addr
|
| 927 |
+
.b8 18 // DW_AT_high_pc
|
| 928 |
+
.b8 1 // DW_FORM_addr
|
| 929 |
+
.b8 49 // DW_AT_abstract_origin
|
| 930 |
+
.b8 19 // DW_FORM_ref4
|
| 931 |
+
.b8 0 // EOM(1)
|
| 932 |
+
.b8 0 // EOM(2)
|
| 933 |
+
.b8 4 // Abbreviation Code
|
| 934 |
+
.b8 29 // DW_TAG_inlined_subroutine
|
| 935 |
+
.b8 0 // DW_CHILDREN_no
|
| 936 |
+
.b8 49 // DW_AT_abstract_origin
|
| 937 |
+
.b8 19 // DW_FORM_ref4
|
| 938 |
+
.b8 17 // DW_AT_low_pc
|
| 939 |
+
.b8 1 // DW_FORM_addr
|
| 940 |
+
.b8 18 // DW_AT_high_pc
|
| 941 |
+
.b8 1 // DW_FORM_addr
|
| 942 |
+
.b8 88 // DW_AT_call_file
|
| 943 |
+
.b8 11 // DW_FORM_data1
|
| 944 |
+
.b8 89 // DW_AT_call_line
|
| 945 |
+
.b8 11 // DW_FORM_data1
|
| 946 |
+
.b8 87 // DW_AT_call_column
|
| 947 |
+
.b8 11 // DW_FORM_data1
|
| 948 |
+
.b8 0 // EOM(1)
|
| 949 |
+
.b8 0 // EOM(2)
|
| 950 |
+
.b8 0 // EOM(3)
|
| 951 |
+
}
|
| 952 |
+
.section .debug_info
|
| 953 |
+
{
|
| 954 |
+
.b32 259 // Length of Unit
|
| 955 |
+
.b8 2 // DWARF version number
|
| 956 |
+
.b8 0
|
| 957 |
+
.b32 .debug_abbrev // Offset Into Abbrev. Section
|
| 958 |
+
.b8 8 // Address Size (in bytes)
|
| 959 |
+
.b8 1 // Abbrev [1] 0xb:0xfc DW_TAG_compile_unit
|
| 960 |
+
.b8 116 // DW_AT_producer
|
| 961 |
+
.b8 114
|
| 962 |
+
.b8 105
|
| 963 |
+
.b8 116
|
| 964 |
+
.b8 111
|
| 965 |
+
.b8 110
|
| 966 |
+
.b8 0
|
| 967 |
+
.b8 2 // DW_AT_language
|
| 968 |
+
.b8 0
|
| 969 |
+
.b8 99 // DW_AT_name
|
| 970 |
+
.b8 53
|
| 971 |
+
.b8 99
|
| 972 |
+
.b8 120
|
| 973 |
+
.b8 115
|
| 974 |
+
.b8 107
|
| 975 |
+
.b8 118
|
| 976 |
+
.b8 108
|
| 977 |
+
.b8 109
|
| 978 |
+
.b8 99
|
| 979 |
+
.b8 52
|
| 980 |
+
.b8 104
|
| 981 |
+
.b8 113
|
| 982 |
+
.b8 54
|
| 983 |
+
.b8 51
|
| 984 |
+
.b8 107
|
| 985 |
+
.b8 109
|
| 986 |
+
.b8 107
|
| 987 |
+
.b8 99
|
| 988 |
+
.b8 101
|
| 989 |
+
.b8 120
|
| 990 |
+
.b8 117
|
| 991 |
+
.b8 107
|
| 992 |
+
.b8 119
|
| 993 |
+
.b8 117
|
| 994 |
+
.b8 104
|
| 995 |
+
.b8 105
|
| 996 |
+
.b8 118
|
| 997 |
+
.b8 117
|
| 998 |
+
.b8 101
|
| 999 |
+
.b8 100
|
| 1000 |
+
.b8 101
|
| 1001 |
+
.b8 55
|
| 1002 |
+
.b8 114
|
| 1003 |
+
.b8 55
|
| 1004 |
+
.b8 116
|
| 1005 |
+
.b8 122
|
| 1006 |
+
.b8 105
|
| 1007 |
+
.b8 100
|
| 1008 |
+
.b8 112
|
| 1009 |
+
.b8 122
|
| 1010 |
+
.b8 111
|
| 1011 |
+
.b8 53
|
| 1012 |
+
.b8 108
|
| 1013 |
+
.b8 108
|
| 1014 |
+
.b8 119
|
| 1015 |
+
.b8 100
|
| 1016 |
+
.b8 101
|
| 1017 |
+
.b8 100
|
| 1018 |
+
.b8 101
|
| 1019 |
+
.b8 101
|
| 1020 |
+
.b8 109
|
| 1021 |
+
.b8 46
|
| 1022 |
+
.b8 112
|
| 1023 |
+
.b8 121
|
| 1024 |
+
.b8 0
|
| 1025 |
+
.b32 .debug_line // DW_AT_stmt_list
|
| 1026 |
+
.b8 47 // DW_AT_comp_dir
|
| 1027 |
+
.b8 119
|
| 1028 |
+
.b8 111
|
| 1029 |
+
.b8 114
|
| 1030 |
+
.b8 107
|
| 1031 |
+
.b8 115
|
| 1032 |
+
.b8 112
|
| 1033 |
+
.b8 97
|
| 1034 |
+
.b8 99
|
| 1035 |
+
.b8 101
|
| 1036 |
+
.b8 47
|
| 1037 |
+
.b8 104
|
| 1038 |
+
.b8 97
|
| 1039 |
+
.b8 110
|
| 1040 |
+
.b8 114
|
| 1041 |
+
.b8 117
|
| 1042 |
+
.b8 105
|
| 1043 |
+
.b8 47
|
| 1044 |
+
.b8 106
|
| 1045 |
+
.b8 117
|
| 1046 |
+
.b8 110
|
| 1047 |
+
.b8 113
|
| 1048 |
+
.b8 117
|
| 1049 |
+
.b8 97
|
| 1050 |
+
.b8 110
|
| 1051 |
+
.b8 47
|
| 1052 |
+
.b8 83
|
| 1053 |
+
.b8 112
|
| 1054 |
+
.b8 101
|
| 1055 |
+
.b8 99
|
| 1056 |
+
.b8 70
|
| 1057 |
+
.b8 111
|
| 1058 |
+
.b8 114
|
| 1059 |
+
.b8 103
|
| 1060 |
+
.b8 101
|
| 1061 |
+
.b8 47
|
| 1062 |
+
.b8 99
|
| 1063 |
+
.b8 97
|
| 1064 |
+
.b8 99
|
| 1065 |
+
.b8 104
|
| 1066 |
+
.b8 101
|
| 1067 |
+
.b8 47
|
| 1068 |
+
.b8 99
|
| 1069 |
+
.b8 111
|
| 1070 |
+
.b8 109
|
| 1071 |
+
.b8 112
|
| 1072 |
+
.b8 105
|
| 1073 |
+
.b8 108
|
| 1074 |
+
.b8 101
|
| 1075 |
+
.b8 100
|
| 1076 |
+
.b8 95
|
| 1077 |
+
.b8 107
|
| 1078 |
+
.b8 101
|
| 1079 |
+
.b8 114
|
| 1080 |
+
.b8 110
|
| 1081 |
+
.b8 101
|
| 1082 |
+
.b8 108
|
| 1083 |
+
.b8 115
|
| 1084 |
+
.b8 47
|
| 1085 |
+
.b8 53
|
| 1086 |
+
.b8 99
|
| 1087 |
+
.b8 0
|
| 1088 |
+
.b8 2 // Abbrev [2] 0x8f:0x19 DW_TAG_subprogram
|
| 1089 |
+
.b8 116 // DW_AT_name
|
| 1090 |
+
.b8 114
|
| 1091 |
+
.b8 105
|
| 1092 |
+
.b8 116
|
| 1093 |
+
.b8 111
|
| 1094 |
+
.b8 110
|
| 1095 |
+
.b8 95
|
| 1096 |
+
.b8 112
|
| 1097 |
+
.b8 101
|
| 1098 |
+
.b8 114
|
| 1099 |
+
.b8 95
|
| 1100 |
+
.b8 102
|
| 1101 |
+
.b8 117
|
| 1102 |
+
.b8 115
|
| 1103 |
+
.b8 101
|
| 1104 |
+
.b8 100
|
| 1105 |
+
.b8 95
|
| 1106 |
+
.b8 109
|
| 1107 |
+
.b8 117
|
| 1108 |
+
.b8 108
|
| 1109 |
+
.b8 95
|
| 1110 |
+
.b8 49
|
| 1111 |
+
.b8 0
|
| 1112 |
+
.b8 1 // DW_AT_inline
|
| 1113 |
+
.b8 3 // Abbrev [3] 0xa8:0x5e DW_TAG_subprogram
|
| 1114 |
+
.b64 $L__func_begin0 // DW_AT_low_pc
|
| 1115 |
+
.b64 $L__func_end0 // DW_AT_high_pc
|
| 1116 |
+
.b32 143 // DW_AT_abstract_origin
|
| 1117 |
+
.b8 4 // Abbrev [4] 0xbd:0x18 DW_TAG_inlined_subroutine
|
| 1118 |
+
.b32 143 // DW_AT_abstract_origin
|
| 1119 |
+
.b64 $L__tmp1 // DW_AT_low_pc
|
| 1120 |
+
.b64 $L__tmp2 // DW_AT_high_pc
|
| 1121 |
+
.b8 1 // DW_AT_call_file
|
| 1122 |
+
.b8 39 // DW_AT_call_line
|
| 1123 |
+
.b8 37 // DW_AT_call_column
|
| 1124 |
+
.b8 4 // Abbrev [4] 0xd5:0x18 DW_TAG_inlined_subroutine
|
| 1125 |
+
.b32 143 // DW_AT_abstract_origin
|
| 1126 |
+
.b64 $L__tmp3 // DW_AT_low_pc
|
| 1127 |
+
.b64 $L__tmp4 // DW_AT_high_pc
|
| 1128 |
+
.b8 1 // DW_AT_call_file
|
| 1129 |
+
.b8 49 // DW_AT_call_line
|
| 1130 |
+
.b8 26 // DW_AT_call_column
|
| 1131 |
+
.b8 4 // Abbrev [4] 0xed:0x18 DW_TAG_inlined_subroutine
|
| 1132 |
+
.b32 143 // DW_AT_abstract_origin
|
| 1133 |
+
.b64 $L__tmp5 // DW_AT_low_pc
|
| 1134 |
+
.b64 $L__tmp10 // DW_AT_high_pc
|
| 1135 |
+
.b8 1 // DW_AT_call_file
|
| 1136 |
+
.b8 34 // DW_AT_call_line
|
| 1137 |
+
.b8 51 // DW_AT_call_column
|
| 1138 |
+
.b8 0 // End Of Children Mark
|
| 1139 |
+
.b8 0 // End Of Children Mark
|
| 1140 |
+
}
|
| 1141 |
+
.section .debug_macinfo { }
|
progress/SpecForge/cache/compiled_kernels/triton/2/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.source
ADDED
|
@@ -0,0 +1,391 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":18:0)
|
| 2 |
+
#loc57 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":69:0)
|
| 3 |
+
#loc69 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":122:0)
|
| 4 |
+
#loc71 = loc(unknown)
|
| 5 |
+
#loc74 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":109:0)
|
| 6 |
+
#loc83 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0)
|
| 7 |
+
#loc87 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0)
|
| 8 |
+
#loc96 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0)
|
| 9 |
+
#loc100 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0)
|
| 10 |
+
#loc104 = loc("in_ptr0"(#loc))
|
| 11 |
+
#loc105 = loc("in_ptr1"(#loc))
|
| 12 |
+
#loc106 = loc("out_ptr0"(#loc))
|
| 13 |
+
#loc107 = loc("out_ptr1"(#loc))
|
| 14 |
+
#loc108 = loc("out_ptr2"(#loc))
|
| 15 |
+
#loc109 = loc("ks0"(#loc))
|
| 16 |
+
#loc110 = loc("xnumel"(#loc))
|
| 17 |
+
#loc111 = loc("r0_numel"(#loc))
|
| 18 |
+
#loc154 = loc("a"(#loc57))
|
| 19 |
+
#loc155 = loc("b"(#loc57))
|
| 20 |
+
#loc161 = loc("a"(#loc69))
|
| 21 |
+
#loc162 = loc("a"(#loc74))
|
| 22 |
+
#loc163 = loc("b"(#loc74))
|
| 23 |
+
#loc167 = loc("x"(#loc83))
|
| 24 |
+
#loc168 = loc("x"(#loc87))
|
| 25 |
+
#loc169 = loc("input"(#loc96))
|
| 26 |
+
#loc170 = loc("a"(#loc100))
|
| 27 |
+
#loc171 = loc("b"(#loc100))
|
| 28 |
+
module {
|
| 29 |
+
tt.func public @triton_per_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 30 |
+
%r0_numel_0 = arith.constant 32 : i32 loc(#loc112)
|
| 31 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc113)
|
| 32 |
+
%xoffset_1 = arith.constant 128 : i32 loc(#loc114)
|
| 33 |
+
%xoffset_2 = arith.constant 128 : i32 loc(#loc114)
|
| 34 |
+
%xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc114)
|
| 35 |
+
%xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc115)
|
| 36 |
+
%xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc116)
|
| 37 |
+
%xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<128x1xi32> loc(#loc117)
|
| 38 |
+
%xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<128x1xi32> loc(#loc117)
|
| 39 |
+
%xmask = tt.splat %xnumel : i32 -> tensor<128x1xi32> loc(#loc118)
|
| 40 |
+
%xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<128x1xi32> loc(#loc118)
|
| 41 |
+
%r0_index = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc119)
|
| 42 |
+
%r0_index_8 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc120)
|
| 43 |
+
%r0_offset = arith.constant 0 : i32 loc(#loc121)
|
| 44 |
+
%r0_mask = arith.constant true loc(#loc122)
|
| 45 |
+
%r0_mask_9 = arith.constant dense<true> : tensor<128x32xi1> loc(#loc122)
|
| 46 |
+
%x2 = arith.extsi %xindex_6 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc123)
|
| 47 |
+
%x2_10 = tt.splat %ks0 : i64 -> tensor<128x1xi64> loc(#loc123)
|
| 48 |
+
%x2_11 = arith.remsi %x2, %x2_10 : tensor<128x1xi64> loc(#loc123)
|
| 49 |
+
%x3 = tt.call @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S128_1S_i64__(%xindex_6, %ks0) : (tensor<128x1xi32>, i64) -> tensor<128x1xi64> loc(#loc124)
|
| 50 |
+
%tmp0 = arith.constant 32 : i32 loc(#loc125)
|
| 51 |
+
%tmp0_12 = arith.constant 32 : i64 loc(#loc125)
|
| 52 |
+
%tmp0_13 = arith.muli %tmp0_12, %ks0 : i64 loc(#loc125)
|
| 53 |
+
%tmp0_14 = arith.extsi %r0_index_8 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc126)
|
| 54 |
+
%tmp0_15 = tt.splat %tmp0_13 : i64 -> tensor<1x32xi64> loc(#loc126)
|
| 55 |
+
%tmp0_16 = arith.muli %tmp0_15, %tmp0_14 : tensor<1x32xi64> loc(#loc126)
|
| 56 |
+
%tmp0_17 = arith.extsi %xindex_6 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc127)
|
| 57 |
+
%tmp0_18 = tt.broadcast %tmp0_17 : tensor<128x1xi64> -> tensor<128x32xi64> loc(#loc127)
|
| 58 |
+
%tmp0_19 = tt.broadcast %tmp0_16 : tensor<1x32xi64> -> tensor<128x32xi64> loc(#loc127)
|
| 59 |
+
%tmp0_20 = arith.addi %tmp0_18, %tmp0_19 : tensor<128x32xi64> loc(#loc127)
|
| 60 |
+
%tmp0_21 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<128x32x!tt.ptr<f32>> loc(#loc128)
|
| 61 |
+
%tmp0_22 = tt.addptr %tmp0_21, %tmp0_20 : tensor<128x32x!tt.ptr<f32>>, tensor<128x32xi64> loc(#loc128)
|
| 62 |
+
%tmp0_23 = arith.constant 0.000000e+00 : f32 loc(#loc129)
|
| 63 |
+
%tmp0_24 = tt.broadcast %xmask_7 : tensor<128x1xi1> -> tensor<128x32xi1> loc(#loc129)
|
| 64 |
+
%tmp0_25 = arith.constant dense<0.000000e+00> : tensor<128x32xf32> loc(#loc129)
|
| 65 |
+
%tmp0_26 = tt.load %tmp0_22, %tmp0_24, %tmp0_25 : tensor<128x32x!tt.ptr<f32>> loc(#loc129)
|
| 66 |
+
%tmp5 = arith.constant 32 : i32 loc(#loc130)
|
| 67 |
+
%tmp5_27 = arith.constant 32 : i64 loc(#loc130)
|
| 68 |
+
%tmp5_28 = arith.muli %tmp5_27, %ks0 : i64 loc(#loc130)
|
| 69 |
+
%tmp5_29 = arith.extsi %r0_index_8 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc131)
|
| 70 |
+
%tmp5_30 = tt.splat %tmp5_28 : i64 -> tensor<1x32xi64> loc(#loc131)
|
| 71 |
+
%tmp5_31 = arith.muli %tmp5_30, %tmp5_29 : tensor<1x32xi64> loc(#loc131)
|
| 72 |
+
%tmp5_32 = arith.extsi %xindex_6 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc132)
|
| 73 |
+
%tmp5_33 = tt.broadcast %tmp5_32 : tensor<128x1xi64> -> tensor<128x32xi64> loc(#loc132)
|
| 74 |
+
%tmp5_34 = tt.broadcast %tmp5_31 : tensor<1x32xi64> -> tensor<128x32xi64> loc(#loc132)
|
| 75 |
+
%tmp5_35 = arith.addi %tmp5_33, %tmp5_34 : tensor<128x32xi64> loc(#loc132)
|
| 76 |
+
%tmp5_36 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<128x32x!tt.ptr<f32>> loc(#loc133)
|
| 77 |
+
%tmp5_37 = tt.addptr %tmp5_36, %tmp5_35 : tensor<128x32x!tt.ptr<f32>>, tensor<128x32xi64> loc(#loc133)
|
| 78 |
+
%tmp5_38 = arith.constant 0.000000e+00 : f32 loc(#loc134)
|
| 79 |
+
%tmp5_39 = tt.broadcast %xmask_7 : tensor<128x1xi1> -> tensor<128x32xi1> loc(#loc134)
|
| 80 |
+
%tmp5_40 = arith.constant dense<0.000000e+00> : tensor<128x32xf32> loc(#loc134)
|
| 81 |
+
%tmp5_41 = tt.load %tmp5_37, %tmp5_39, %tmp5_40 : tensor<128x32x!tt.ptr<f32>> loc(#loc134)
|
| 82 |
+
%tmp3 = arith.constant 0xFF800000 : f32 loc(#loc135)
|
| 83 |
+
%tmp3_42 = arith.constant 0xFF800000 : f32 loc(#loc135)
|
| 84 |
+
%tmp3_43 = arith.constant dense<0xFF800000> : tensor<128x32xf32> loc(#loc135)
|
| 85 |
+
%tmp3_44 = tt.broadcast %xmask_7 : tensor<128x1xi1> -> tensor<128x32xi1> loc(#loc135)
|
| 86 |
+
%tmp3_45 = arith.select %tmp3_44, %tmp0_26, %tmp3_43 : tensor<128x32xi1>, tensor<128x32xf32> loc(#loc135)
|
| 87 |
+
%tmp4 = tt.call @"torch._inductor.runtime.triton_helpers.max2__fp32S128_32S__(1,)cconstexpr_1_"(%tmp3_45) : (tensor<128x32xf32>) -> tensor<128xf32> loc(#loc136)
|
| 88 |
+
%tmp4_46 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc137)
|
| 89 |
+
%tmp6 = arith.constant 0xFF800000 : f32 loc(#loc138)
|
| 90 |
+
%tmp7 = arith.constant dense<0xFF800000> : tensor<128x1xf32> loc(#loc139)
|
| 91 |
+
%tmp7_47 = arith.cmpf oeq, %tmp4_46, %tmp7 : tensor<128x1xf32> loc(#loc139)
|
| 92 |
+
%tmp8 = tt.broadcast %tmp4_46 : tensor<128x1xf32> -> tensor<128x32xf32> loc(#loc140)
|
| 93 |
+
%tmp8_48 = arith.subf %tmp0_26, %tmp8 : tensor<128x32xf32> loc(#loc140)
|
| 94 |
+
%tmp9 = arith.constant 0.000000e+00 : f32 loc(#loc141)
|
| 95 |
+
%tmp10 = arith.constant dense<0.000000e+00> : tensor<128x32xf32> loc(#loc142)
|
| 96 |
+
%tmp10_49 = tt.broadcast %tmp7_47 : tensor<128x1xi1> -> tensor<128x32xi1> loc(#loc142)
|
| 97 |
+
%tmp10_50 = arith.select %tmp10_49, %tmp10, %tmp8_48 : tensor<128x32xi1>, tensor<128x32xf32> loc(#loc142)
|
| 98 |
+
%tmp11 = tt.extern_elementwise %tmp10_50 {libname = "", libpath = "", pure = true, symbol = "__nv_exp2f"} : (tensor<128x32xf32>) -> tensor<128x32xf32> loc(#loc143)
|
| 99 |
+
%tmp12 = arith.mulf %tmp5_41, %tmp11 : tensor<128x32xf32> loc(#loc144)
|
| 100 |
+
%tmp15 = arith.constant 0 : i32 loc(#loc145)
|
| 101 |
+
%tmp15_51 = arith.constant 0.000000e+00 : f32 loc(#loc145)
|
| 102 |
+
%tmp15_52 = arith.constant dense<0.000000e+00> : tensor<128x32xf32> loc(#loc145)
|
| 103 |
+
%tmp15_53 = tt.broadcast %xmask_7 : tensor<128x1xi1> -> tensor<128x32xi1> loc(#loc145)
|
| 104 |
+
%tmp15_54 = arith.select %tmp15_53, %tmp12, %tmp15_52 : tensor<128x32xi1>, tensor<128x32xf32> loc(#loc145)
|
| 105 |
+
%tmp16 = tt.call @"triton.language.standard.sum__fp32S128_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp15_54) : (tensor<128x32xf32>) -> tensor<128xf32> loc(#loc146)
|
| 106 |
+
%tmp16_55 = tt.expand_dims %tmp16 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc147)
|
| 107 |
+
%tmp17 = arith.constant 1.000000e+00 : f32 loc(#loc148)
|
| 108 |
+
%tmp18 = arith.constant dense<1.000000e+00> : tensor<128x1xf32> loc(#loc149)
|
| 109 |
+
%tmp18_56 = arith.select %tmp7_47, %tmp18, %tmp16_55 : tensor<128x1xi1>, tensor<128x1xf32> loc(#loc149)
|
| 110 |
+
%tmp19 = tt.extern_elementwise %tmp18_56 {libname = "", libpath = "", pure = true, symbol = "__nv_log2f"} : (tensor<128x1xf32>) -> tensor<128x1xf32> loc(#loc150)
|
| 111 |
+
%tmp20 = arith.addf %tmp19, %tmp4_46 : tensor<128x1xf32> loc(#loc151)
|
| 112 |
+
%tmp21 = arith.constant 0.693147182 : f32 loc(#loc152)
|
| 113 |
+
%tmp22 = arith.constant dense<0.693147182> : tensor<128x1xf32> loc(#loc153)
|
| 114 |
+
%tmp22_57 = arith.mulf %tmp20, %tmp22 : tensor<128x1xf32> loc(#loc153)
|
| 115 |
+
%c1_i32 = arith.constant 1 : i32 loc(#loc43)
|
| 116 |
+
%0 = arith.extsi %c1_i32 : i32 to i64 loc(#loc43)
|
| 117 |
+
%1 = arith.cmpi sge, %0, %ks0 : i64 loc(#loc43)
|
| 118 |
+
%c1_i32_58 = arith.constant 1 : i32 loc(#loc44)
|
| 119 |
+
%c1_i32_59 = arith.constant 1 : i32 loc(#loc44)
|
| 120 |
+
%2 = arith.extui %1 : i1 to i32 loc(#loc44)
|
| 121 |
+
%3 = arith.muli %c1_i32_59, %2 : i32 loc(#loc44)
|
| 122 |
+
%c1_i32_60 = arith.constant 1 : i32 loc(#loc45)
|
| 123 |
+
%4 = arith.extsi %c1_i32_60 : i32 to i64 loc(#loc45)
|
| 124 |
+
%5 = arith.cmpi sgt, %ks0, %4 : i64 loc(#loc45)
|
| 125 |
+
%6 = arith.extui %5 : i1 to i64 loc(#loc46)
|
| 126 |
+
%7 = arith.muli %ks0, %6 : i64 loc(#loc46)
|
| 127 |
+
%8 = arith.extsi %3 : i32 to i64 loc(#loc47)
|
| 128 |
+
%9 = arith.addi %8, %7 : i64 loc(#loc47)
|
| 129 |
+
%10 = tt.splat %9 : i64 -> tensor<128x1xi64> loc(#loc48)
|
| 130 |
+
%11 = arith.muli %x3, %10 : tensor<128x1xi64> loc(#loc48)
|
| 131 |
+
%12 = arith.addi %x2_11, %11 : tensor<128x1xi64> loc(#loc49)
|
| 132 |
+
%13 = tt.splat %out_ptr2 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>> loc(#loc50)
|
| 133 |
+
%14 = tt.addptr %13, %12 : tensor<128x1x!tt.ptr<f32>>, tensor<128x1xi64> loc(#loc50)
|
| 134 |
+
tt.store %14, %tmp22_57, %xmask_7 : tensor<128x1x!tt.ptr<f32>> loc(#loc51)
|
| 135 |
+
%15 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>> loc(#loc52)
|
| 136 |
+
%16 = tt.addptr %15, %xindex_6 : tensor<128x1x!tt.ptr<f32>>, tensor<128x1xi32> loc(#loc52)
|
| 137 |
+
tt.store %16, %tmp4_46, %xmask_7 : tensor<128x1x!tt.ptr<f32>> loc(#loc53)
|
| 138 |
+
%17 = tt.splat %out_ptr1 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>> loc(#loc54)
|
| 139 |
+
%18 = tt.addptr %17, %xindex_6 : tensor<128x1x!tt.ptr<f32>>, tensor<128x1xi32> loc(#loc54)
|
| 140 |
+
tt.store %18, %tmp16_55, %xmask_7 : tensor<128x1x!tt.ptr<f32>> loc(#loc55)
|
| 141 |
+
tt.return loc(#loc56)
|
| 142 |
+
} loc(#loc)
|
| 143 |
+
tt.func private @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S128_1S_i64__(%a: tensor<128x1xi32> loc("a"(#loc57)), %b: i64 loc("b"(#loc57))) -> tensor<128x1xi64> attributes {noinline = false} {
|
| 144 |
+
%quot = arith.extsi %a : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc156)
|
| 145 |
+
%quot_0 = tt.splat %b : i64 -> tensor<128x1xi64> loc(#loc156)
|
| 146 |
+
%quot_1 = arith.divsi %quot, %quot_0 : tensor<128x1xi64> loc(#loc156)
|
| 147 |
+
%remainder = arith.extsi %a : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc157)
|
| 148 |
+
%remainder_2 = tt.splat %b : i64 -> tensor<128x1xi64> loc(#loc157)
|
| 149 |
+
%remainder_3 = arith.remsi %remainder, %remainder_2 : tensor<128x1xi64> loc(#loc157)
|
| 150 |
+
%fixed = arith.constant 0 : i32 loc(#loc158)
|
| 151 |
+
%fixed_4 = arith.extsi %fixed : i32 to i64 loc(#loc158)
|
| 152 |
+
%fixed_5 = tt.splat %fixed_4 : i64 -> tensor<128x1xi64> loc(#loc158)
|
| 153 |
+
%fixed_6 = arith.cmpi ne, %remainder_3, %fixed_5 : tensor<128x1xi64> loc(#loc158)
|
| 154 |
+
%fixed_7 = arith.constant 1 : i32 loc(#loc159)
|
| 155 |
+
%fixed_8 = arith.constant 1 : i64 loc(#loc159)
|
| 156 |
+
%fixed_9 = arith.constant dense<1> : tensor<128x1xi64> loc(#loc159)
|
| 157 |
+
%fixed_10 = arith.subi %quot_1, %fixed_9 : tensor<128x1xi64> loc(#loc159)
|
| 158 |
+
%fixed_11 = arith.select %fixed_6, %fixed_10, %quot_1 : tensor<128x1xi1>, tensor<128x1xi64> loc(#loc160)
|
| 159 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc63)
|
| 160 |
+
%cst = arith.constant dense<0> : tensor<128x1xi32> loc(#loc63)
|
| 161 |
+
%0 = arith.cmpi slt, %a, %cst : tensor<128x1xi32> loc(#loc63)
|
| 162 |
+
%c0_i32_12 = arith.constant 0 : i32 loc(#loc64)
|
| 163 |
+
%1 = arith.extsi %c0_i32_12 : i32 to i64 loc(#loc64)
|
| 164 |
+
%2 = arith.cmpi slt, %b, %1 : i64 loc(#loc64)
|
| 165 |
+
%3 = tt.splat %2 : i1 -> tensor<128x1xi1> loc(#loc65)
|
| 166 |
+
%4 = arith.cmpi ne, %0, %3 : tensor<128x1xi1> loc(#loc65)
|
| 167 |
+
%5 = arith.select %4, %fixed_11, %quot_1 : tensor<128x1xi1>, tensor<128x1xi64> loc(#loc66)
|
| 168 |
+
tt.return %5 : tensor<128x1xi64> loc(#loc67)
|
| 169 |
+
^bb1: // no predecessors
|
| 170 |
+
%6 = ub.poison : tensor<128x1xi64> loc(#loc68)
|
| 171 |
+
tt.return %6 : tensor<128x1xi64> loc(#loc68)
|
| 172 |
+
} loc(#loc57)
|
| 173 |
+
tt.func private @"torch._inductor.runtime.triton_helpers.max2__fp32S128_32S__(1,)cconstexpr_1_"(%a: tensor<128x32xf32> loc("a"(#loc69))) -> tensor<128xf32> attributes {noinline = false} {
|
| 174 |
+
%0 = "tt.reduce"(%a) <{axis = 1 : i32}> ({
|
| 175 |
+
^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
|
| 176 |
+
%2 = tt.call @torch._inductor.runtime.triton_helpers.maximum__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc70)
|
| 177 |
+
tt.reduce.return %2 : f32 loc(#loc70)
|
| 178 |
+
}) : (tensor<128x32xf32>) -> tensor<128xf32> loc(#loc70)
|
| 179 |
+
tt.return %0 : tensor<128xf32> loc(#loc72)
|
| 180 |
+
^bb1: // no predecessors
|
| 181 |
+
%1 = ub.poison : tensor<128xf32> loc(#loc73)
|
| 182 |
+
tt.return %1 : tensor<128xf32> loc(#loc73)
|
| 183 |
+
} loc(#loc69)
|
| 184 |
+
tt.func private @torch._inductor.runtime.triton_helpers.maximum__fp32_fp32__(%a: f32 loc("a"(#loc74)), %b: f32 loc("b"(#loc74))) -> f32 attributes {noinline = false} {
|
| 185 |
+
%mask = arith.cmpf ogt, %a, %b : f32 loc(#loc172)
|
| 186 |
+
%0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a) : (f32) -> i1 loc(#loc76)
|
| 187 |
+
%1 = scf.if %0 -> (i1) {
|
| 188 |
+
%mask_0 = arith.cmpf une, %a, %a : f32 loc(#loc165)
|
| 189 |
+
%mask_1 = arith.ori %mask, %mask_0 : i1 loc(#loc173)
|
| 190 |
+
scf.yield %mask_1 : i1 loc(#loc173)
|
| 191 |
+
} else {
|
| 192 |
+
scf.yield %mask : i1 loc(#loc71)
|
| 193 |
+
} loc(#loc77)
|
| 194 |
+
%2 = arith.select %1, %a, %b : f32 loc(#loc80)
|
| 195 |
+
tt.return %2 : f32 loc(#loc81)
|
| 196 |
+
^bb1: // no predecessors
|
| 197 |
+
%3 = ub.poison : f32 loc(#loc82)
|
| 198 |
+
tt.return %3 : f32 loc(#loc82)
|
| 199 |
+
} loc(#loc74)
|
| 200 |
+
tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc83))) -> i1 attributes {noinline = false} {
|
| 201 |
+
%0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc84)
|
| 202 |
+
%true = arith.constant true loc(#loc85)
|
| 203 |
+
tt.return %true : i1 loc(#loc85)
|
| 204 |
+
^bb1: // no predecessors
|
| 205 |
+
%1 = ub.poison : i1 loc(#loc86)
|
| 206 |
+
tt.return %1 : i1 loc(#loc86)
|
| 207 |
+
} loc(#loc83)
|
| 208 |
+
tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc87))) -> tensor<1xf32> attributes {noinline = false} {
|
| 209 |
+
%0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc88)
|
| 210 |
+
%1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc89)
|
| 211 |
+
%2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc89)
|
| 212 |
+
%3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc89)
|
| 213 |
+
tt.return %3 : tensor<1xf32> loc(#loc90)
|
| 214 |
+
^bb1: // no predecessors
|
| 215 |
+
%4 = ub.poison : tensor<1xf32> loc(#loc91)
|
| 216 |
+
tt.return %4 : tensor<1xf32> loc(#loc91)
|
| 217 |
+
} loc(#loc87)
|
| 218 |
+
tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} {
|
| 219 |
+
%false = arith.constant false loc(#loc93)
|
| 220 |
+
%cst = arith.constant dense<false> : tensor<1xi1> loc(#loc93)
|
| 221 |
+
tt.return %cst : tensor<1xi1> loc(#loc94)
|
| 222 |
+
^bb1: // no predecessors
|
| 223 |
+
%0 = ub.poison : tensor<1xi1> loc(#loc95)
|
| 224 |
+
tt.return %0 : tensor<1xi1> loc(#loc95)
|
| 225 |
+
} loc(#loc92)
|
| 226 |
+
tt.func private @"triton.language.standard.sum__fp32S128_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x32xf32> loc("input"(#loc96))) -> tensor<128xf32> attributes {noinline = false} {
|
| 227 |
+
%0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
|
| 228 |
+
^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
|
| 229 |
+
%2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc97)
|
| 230 |
+
tt.reduce.return %2 : f32 loc(#loc97)
|
| 231 |
+
}) : (tensor<128x32xf32>) -> tensor<128xf32> loc(#loc97)
|
| 232 |
+
tt.return %0 : tensor<128xf32> loc(#loc98)
|
| 233 |
+
^bb1: // no predecessors
|
| 234 |
+
%1 = ub.poison : tensor<128xf32> loc(#loc99)
|
| 235 |
+
tt.return %1 : tensor<128xf32> loc(#loc99)
|
| 236 |
+
} loc(#loc96)
|
| 237 |
+
tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc100)), %b: f32 loc("b"(#loc100))) -> f32 attributes {noinline = false} {
|
| 238 |
+
%0 = arith.addf %a, %b : f32 loc(#loc101)
|
| 239 |
+
tt.return %0 : f32 loc(#loc102)
|
| 240 |
+
^bb1: // no predecessors
|
| 241 |
+
%1 = ub.poison : f32 loc(#loc103)
|
| 242 |
+
tt.return %1 : f32 loc(#loc103)
|
| 243 |
+
} loc(#loc100)
|
| 244 |
+
} loc(#loc)
|
| 245 |
+
#loc1 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":19:15)
|
| 246 |
+
#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":23:28)
|
| 247 |
+
#loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":23:33)
|
| 248 |
+
#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":24:36)
|
| 249 |
+
#loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":24:44)
|
| 250 |
+
#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":24:23)
|
| 251 |
+
#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":25:21)
|
| 252 |
+
#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":26:28)
|
| 253 |
+
#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":26:38)
|
| 254 |
+
#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":27:16)
|
| 255 |
+
#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":28:48)
|
| 256 |
+
#loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":33:19)
|
| 257 |
+
#loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":34:51)
|
| 258 |
+
#loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":35:38)
|
| 259 |
+
#loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":35:42)
|
| 260 |
+
#loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":35:35)
|
| 261 |
+
#loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":35:30)
|
| 262 |
+
#loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":35:49)
|
| 263 |
+
#loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":36:38)
|
| 264 |
+
#loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":36:42)
|
| 265 |
+
#loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":36:35)
|
| 266 |
+
#loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":36:30)
|
| 267 |
+
#loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":36:49)
|
| 268 |
+
#loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":38:33)
|
| 269 |
+
#loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":39:37)
|
| 270 |
+
#loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":39:40)
|
| 271 |
+
#loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":40:11)
|
| 272 |
+
#loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":41:19)
|
| 273 |
+
#loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":42:18)
|
| 274 |
+
#loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":43:11)
|
| 275 |
+
#loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":44:33)
|
| 276 |
+
#loc32 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":45:27)
|
| 277 |
+
#loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":46:19)
|
| 278 |
+
#loc34 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":48:35)
|
| 279 |
+
#loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":49:26)
|
| 280 |
+
#loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":49:29)
|
| 281 |
+
#loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":50:12)
|
| 282 |
+
#loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":51:34)
|
| 283 |
+
#loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":52:27)
|
| 284 |
+
#loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":53:20)
|
| 285 |
+
#loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":54:12)
|
| 286 |
+
#loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":55:20)
|
| 287 |
+
#loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:49)
|
| 288 |
+
#loc44 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:41)
|
| 289 |
+
#loc45 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:75)
|
| 290 |
+
#loc46 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:66)
|
| 291 |
+
#loc47 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:57)
|
| 292 |
+
#loc48 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:34)
|
| 293 |
+
#loc49 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:30)
|
| 294 |
+
#loc50 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:25)
|
| 295 |
+
#loc51 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:89)
|
| 296 |
+
#loc52 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":57:25)
|
| 297 |
+
#loc53 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":57:36)
|
| 298 |
+
#loc54 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":58:25)
|
| 299 |
+
#loc55 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":58:37)
|
| 300 |
+
#loc56 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":58:4)
|
| 301 |
+
#loc58 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
|
| 302 |
+
#loc59 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20)
|
| 303 |
+
#loc60 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
|
| 304 |
+
#loc61 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
|
| 305 |
+
#loc62 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
|
| 306 |
+
#loc63 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
|
| 307 |
+
#loc64 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36)
|
| 308 |
+
#loc65 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
|
| 309 |
+
#loc66 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
|
| 310 |
+
#loc67 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:11)
|
| 311 |
+
#loc68 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:4)
|
| 312 |
+
#loc70 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29)
|
| 313 |
+
#loc72 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:11)
|
| 314 |
+
#loc73 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:4)
|
| 315 |
+
#loc75 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15)
|
| 316 |
+
#loc76 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:19)
|
| 317 |
+
#loc77 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:7)
|
| 318 |
+
#loc78 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21)
|
| 319 |
+
#loc79 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16)
|
| 320 |
+
#loc80 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29)
|
| 321 |
+
#loc81 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:11)
|
| 322 |
+
#loc82 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:4)
|
| 323 |
+
#loc84 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29)
|
| 324 |
+
#loc85 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11)
|
| 325 |
+
#loc86 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4)
|
| 326 |
+
#loc88 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30)
|
| 327 |
+
#loc89 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15)
|
| 328 |
+
#loc90 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11)
|
| 329 |
+
#loc91 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4)
|
| 330 |
+
#loc92 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0)
|
| 331 |
+
#loc93 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31)
|
| 332 |
+
#loc94 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11)
|
| 333 |
+
#loc95 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4)
|
| 334 |
+
#loc97 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
|
| 335 |
+
#loc98 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11)
|
| 336 |
+
#loc99 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4)
|
| 337 |
+
#loc101 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
|
| 338 |
+
#loc102 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11)
|
| 339 |
+
#loc103 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4)
|
| 340 |
+
#loc112 = loc("r0_numel"(#loc1))
|
| 341 |
+
#loc113 = loc("xoffset"(#loc2))
|
| 342 |
+
#loc114 = loc("xoffset"(#loc3))
|
| 343 |
+
#loc115 = loc("xindex"(#loc4))
|
| 344 |
+
#loc116 = loc("xindex"(#loc5))
|
| 345 |
+
#loc117 = loc("xindex"(#loc6))
|
| 346 |
+
#loc118 = loc("xmask"(#loc7))
|
| 347 |
+
#loc119 = loc("r0_index"(#loc8))
|
| 348 |
+
#loc120 = loc("r0_index"(#loc9))
|
| 349 |
+
#loc121 = loc("r0_offset"(#loc10))
|
| 350 |
+
#loc122 = loc("r0_mask"(#loc11))
|
| 351 |
+
#loc123 = loc("x2"(#loc12))
|
| 352 |
+
#loc124 = loc("x3"(#loc13))
|
| 353 |
+
#loc125 = loc("tmp0"(#loc14))
|
| 354 |
+
#loc126 = loc("tmp0"(#loc15))
|
| 355 |
+
#loc127 = loc("tmp0"(#loc16))
|
| 356 |
+
#loc128 = loc("tmp0"(#loc17))
|
| 357 |
+
#loc129 = loc("tmp0"(#loc18))
|
| 358 |
+
#loc130 = loc("tmp5"(#loc19))
|
| 359 |
+
#loc131 = loc("tmp5"(#loc20))
|
| 360 |
+
#loc132 = loc("tmp5"(#loc21))
|
| 361 |
+
#loc133 = loc("tmp5"(#loc22))
|
| 362 |
+
#loc134 = loc("tmp5"(#loc23))
|
| 363 |
+
#loc135 = loc("tmp3"(#loc24))
|
| 364 |
+
#loc136 = loc("tmp4"(#loc25))
|
| 365 |
+
#loc137 = loc("tmp4"(#loc26))
|
| 366 |
+
#loc138 = loc("tmp6"(#loc27))
|
| 367 |
+
#loc139 = loc("tmp7"(#loc28))
|
| 368 |
+
#loc140 = loc("tmp8"(#loc29))
|
| 369 |
+
#loc141 = loc("tmp9"(#loc30))
|
| 370 |
+
#loc142 = loc("tmp10"(#loc31))
|
| 371 |
+
#loc143 = loc("tmp11"(#loc32))
|
| 372 |
+
#loc144 = loc("tmp12"(#loc33))
|
| 373 |
+
#loc145 = loc("tmp15"(#loc34))
|
| 374 |
+
#loc146 = loc("tmp16"(#loc35))
|
| 375 |
+
#loc147 = loc("tmp16"(#loc36))
|
| 376 |
+
#loc148 = loc("tmp17"(#loc37))
|
| 377 |
+
#loc149 = loc("tmp18"(#loc38))
|
| 378 |
+
#loc150 = loc("tmp19"(#loc39))
|
| 379 |
+
#loc151 = loc("tmp20"(#loc40))
|
| 380 |
+
#loc152 = loc("tmp21"(#loc41))
|
| 381 |
+
#loc153 = loc("tmp22"(#loc42))
|
| 382 |
+
#loc156 = loc("quot"(#loc58))
|
| 383 |
+
#loc157 = loc("remainder"(#loc59))
|
| 384 |
+
#loc158 = loc("fixed"(#loc60))
|
| 385 |
+
#loc159 = loc("fixed"(#loc61))
|
| 386 |
+
#loc160 = loc("fixed"(#loc62))
|
| 387 |
+
#loc164 = loc("mask"(#loc75))
|
| 388 |
+
#loc165 = loc("mask"(#loc78))
|
| 389 |
+
#loc166 = loc("mask"(#loc79))
|
| 390 |
+
#loc172 = loc("mask"(#loc164))
|
| 391 |
+
#loc173 = loc("mask"(#loc166))
|
progress/SpecForge/cache/compiled_kernels/triton/2/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.ttgir
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 2], order = [0, 1]}>
|
| 2 |
+
#blocked1 = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 8], order = [0, 1]}>
|
| 3 |
+
#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":18:0)
|
| 4 |
+
#loc1 = loc(unknown)
|
| 5 |
+
#loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":39:37)
|
| 6 |
+
#loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":49:26)
|
| 7 |
+
#loc61 = loc("in_ptr0"(#loc))
|
| 8 |
+
#loc62 = loc("in_ptr1"(#loc))
|
| 9 |
+
#loc63 = loc("out_ptr0"(#loc))
|
| 10 |
+
#loc64 = loc("out_ptr1"(#loc))
|
| 11 |
+
#loc65 = loc("out_ptr2"(#loc))
|
| 12 |
+
#loc66 = loc("ks0"(#loc))
|
| 13 |
+
#loc67 = loc("xnumel"(#loc))
|
| 14 |
+
#loc68 = loc("r0_numel"(#loc))
|
| 15 |
+
#loc89 = loc("tmp4"(#loc27))
|
| 16 |
+
#loc100 = loc("tmp16"(#loc40))
|
| 17 |
+
#loc116 = loc(callsite(#loc1 at #loc89))
|
| 18 |
+
#loc120 = loc(callsite(#loc1 at #loc100))
|
| 19 |
+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
|
| 20 |
+
tt.func public @triton_per_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 21 |
+
%cst = arith.constant dense<0> : tensor<128x1xi64, #blocked> loc(#loc1)
|
| 22 |
+
%cst_0 = arith.constant dense<0> : tensor<128x1xi32, #blocked> loc(#loc1)
|
| 23 |
+
%cst_1 = arith.constant dense<1> : tensor<128x1xi64, #blocked> loc(#loc1)
|
| 24 |
+
%c32_i64 = arith.constant 32 : i64 loc(#loc1)
|
| 25 |
+
%c128_i32 = arith.constant 128 : i32 loc(#loc1)
|
| 26 |
+
%c0_i64 = arith.constant 0 : i64 loc(#loc1)
|
| 27 |
+
%c1_i64 = arith.constant 1 : i64 loc(#loc1)
|
| 28 |
+
%cst_2 = arith.constant dense<0.693147182> : tensor<128x1xf32, #blocked1> loc(#loc1)
|
| 29 |
+
%cst_3 = arith.constant dense<1.000000e+00> : tensor<128x1xf32, #blocked1> loc(#loc1)
|
| 30 |
+
%cst_4 = arith.constant dense<0xFF800000> : tensor<128x1xf32, #blocked1> loc(#loc1)
|
| 31 |
+
%cst_5 = arith.constant dense<0xFF800000> : tensor<128x32xf32, #blocked1> loc(#loc1)
|
| 32 |
+
%cst_6 = arith.constant dense<0.000000e+00> : tensor<128x32xf32, #blocked1> loc(#loc1)
|
| 33 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc69)
|
| 34 |
+
%xoffset_7 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc70)
|
| 35 |
+
%xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc71)
|
| 36 |
+
%xindex_8 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc71)
|
| 37 |
+
%xindex_9 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1xi32, #blocked1> loc(#loc71)
|
| 38 |
+
%xindex_10 = tt.expand_dims %xindex_8 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc71)
|
| 39 |
+
%xindex_11 = tt.splat %xoffset_7 : i32 -> tensor<128x1xi32, #blocked1> loc(#loc72)
|
| 40 |
+
%xindex_12 = tt.splat %xoffset_7 : i32 -> tensor<128x1xi32, #blocked> loc(#loc72)
|
| 41 |
+
%xindex_13 = arith.addi %xindex_11, %xindex_9 : tensor<128x1xi32, #blocked1> loc(#loc72)
|
| 42 |
+
%xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<128x1xi32, #blocked> loc(#loc72)
|
| 43 |
+
%xmask = tt.splat %xnumel : i32 -> tensor<128x1xi32, #blocked1> loc(#loc73)
|
| 44 |
+
%xmask_15 = tt.splat %xnumel : i32 -> tensor<128x1xi32, #blocked> loc(#loc73)
|
| 45 |
+
%xmask_16 = arith.cmpi slt, %xindex_13, %xmask : tensor<128x1xi32, #blocked1> loc(#loc73)
|
| 46 |
+
%xmask_17 = arith.cmpi slt, %xindex_14, %xmask_15 : tensor<128x1xi32, #blocked> loc(#loc73)
|
| 47 |
+
%r0_index = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc74)
|
| 48 |
+
%r0_index_18 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> loc(#loc74)
|
| 49 |
+
%x2 = arith.extsi %xindex_13 : tensor<128x1xi32, #blocked1> to tensor<128x1xi64, #blocked1> loc(#loc75)
|
| 50 |
+
%x2_19 = arith.extsi %xindex_14 : tensor<128x1xi32, #blocked> to tensor<128x1xi64, #blocked> loc(#loc75)
|
| 51 |
+
%x2_20 = tt.splat %ks0 : i64 -> tensor<128x1xi64, #blocked> loc(#loc75)
|
| 52 |
+
%x2_21 = arith.remsi %x2_19, %x2_20 : tensor<128x1xi64, #blocked> loc(#loc75)
|
| 53 |
+
%quot = arith.divsi %x2_19, %x2_20 : tensor<128x1xi64, #blocked> loc(#loc107)
|
| 54 |
+
%fixed = arith.cmpi ne, %x2_21, %cst : tensor<128x1xi64, #blocked> loc(#loc108)
|
| 55 |
+
%fixed_22 = arith.subi %quot, %cst_1 : tensor<128x1xi64, #blocked> loc(#loc109)
|
| 56 |
+
%fixed_23 = arith.select %fixed, %fixed_22, %quot : tensor<128x1xi1, #blocked>, tensor<128x1xi64, #blocked> loc(#loc110)
|
| 57 |
+
%x3 = arith.cmpi slt, %xindex_14, %cst_0 : tensor<128x1xi32, #blocked> loc(#loc111)
|
| 58 |
+
%x3_24 = arith.cmpi slt, %ks0, %c0_i64 : i64 loc(#loc112)
|
| 59 |
+
%x3_25 = tt.splat %x3_24 : i1 -> tensor<128x1xi1, #blocked> loc(#loc113)
|
| 60 |
+
%x3_26 = arith.cmpi ne, %x3, %x3_25 : tensor<128x1xi1, #blocked> loc(#loc113)
|
| 61 |
+
%x3_27 = arith.select %x3_26, %fixed_23, %quot : tensor<128x1xi1, #blocked>, tensor<128x1xi64, #blocked> loc(#loc114)
|
| 62 |
+
%tmp0 = arith.muli %ks0, %c32_i64 : i64 loc(#loc81)
|
| 63 |
+
%tmp0_28 = arith.extsi %r0_index_18 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc82)
|
| 64 |
+
%tmp0_29 = tt.splat %tmp0 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc82)
|
| 65 |
+
%tmp0_30 = arith.muli %tmp0_29, %tmp0_28 : tensor<1x32xi64, #blocked1> loc(#loc82)
|
| 66 |
+
%tmp0_31 = tt.broadcast %x2 : tensor<128x1xi64, #blocked1> -> tensor<128x32xi64, #blocked1> loc(#loc83)
|
| 67 |
+
%tmp0_32 = tt.broadcast %tmp0_30 : tensor<1x32xi64, #blocked1> -> tensor<128x32xi64, #blocked1> loc(#loc83)
|
| 68 |
+
%tmp0_33 = arith.addi %tmp0_31, %tmp0_32 : tensor<128x32xi64, #blocked1> loc(#loc83)
|
| 69 |
+
%tmp0_34 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<128x32x!tt.ptr<f32>, #blocked1> loc(#loc84)
|
| 70 |
+
%tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<128x32x!tt.ptr<f32>, #blocked1>, tensor<128x32xi64, #blocked1> loc(#loc84)
|
| 71 |
+
%tmp0_36 = tt.broadcast %xmask_16 : tensor<128x1xi1, #blocked1> -> tensor<128x32xi1, #blocked1> loc(#loc85)
|
| 72 |
+
%tmp0_37 = tt.load %tmp0_35, %tmp0_36, %cst_6 : tensor<128x32x!tt.ptr<f32>, #blocked1> loc(#loc85)
|
| 73 |
+
%tmp5 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<128x32x!tt.ptr<f32>, #blocked1> loc(#loc86)
|
| 74 |
+
%tmp5_38 = tt.addptr %tmp5, %tmp0_33 : tensor<128x32x!tt.ptr<f32>, #blocked1>, tensor<128x32xi64, #blocked1> loc(#loc86)
|
| 75 |
+
%tmp5_39 = tt.load %tmp5_38, %tmp0_36, %cst_6 : tensor<128x32x!tt.ptr<f32>, #blocked1> loc(#loc87)
|
| 76 |
+
%tmp3 = arith.select %tmp0_36, %tmp0_37, %cst_5 : tensor<128x32xi1, #blocked1>, tensor<128x32xf32, #blocked1> loc(#loc88)
|
| 77 |
+
%tmp4 = "tt.reduce"(%tmp3) <{axis = 1 : i32}> ({
|
| 78 |
+
^bb0(%tmp4_48: f32 loc(callsite(#loc1 at #loc89)), %tmp4_49: f32 loc(callsite(#loc1 at #loc89))):
|
| 79 |
+
%mask = arith.cmpf ogt, %tmp4_48, %tmp4_49 : f32 loc(#loc121)
|
| 80 |
+
%mask_50 = arith.cmpf une, %tmp4_48, %tmp4_48 : f32 loc(#loc122)
|
| 81 |
+
%mask_51 = arith.ori %mask, %mask_50 : i1 loc(#loc123)
|
| 82 |
+
%tmp4_52 = arith.select %mask_51, %tmp4_48, %tmp4_49 : f32 loc(#loc124)
|
| 83 |
+
tt.reduce.return %tmp4_52 : f32 loc(#loc115)
|
| 84 |
+
}) : (tensor<128x32xf32, #blocked1>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc115)
|
| 85 |
+
%tmp4_40 = ttg.convert_layout %tmp4 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc93)
|
| 86 |
+
%tmp4_41 = tt.expand_dims %tmp4_40 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xf32, #blocked> loc(#loc93)
|
| 87 |
+
%tmp4_42 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1xf32, #blocked1> loc(#loc93)
|
| 88 |
+
%tmp7 = arith.cmpf oeq, %tmp4_42, %cst_4 : tensor<128x1xf32, #blocked1> loc(#loc94)
|
| 89 |
+
%tmp8 = tt.broadcast %tmp4_42 : tensor<128x1xf32, #blocked1> -> tensor<128x32xf32, #blocked1> loc(#loc95)
|
| 90 |
+
%tmp8_43 = arith.subf %tmp0_37, %tmp8 : tensor<128x32xf32, #blocked1> loc(#loc95)
|
| 91 |
+
%tmp10 = tt.broadcast %tmp7 : tensor<128x1xi1, #blocked1> -> tensor<128x32xi1, #blocked1> loc(#loc96)
|
| 92 |
+
%tmp10_44 = arith.select %tmp10, %cst_6, %tmp8_43 : tensor<128x32xi1, #blocked1>, tensor<128x32xf32, #blocked1> loc(#loc96)
|
| 93 |
+
%tmp11 = tt.extern_elementwise %tmp10_44 {libname = "", libpath = "", pure = true, symbol = "__nv_exp2f"} : (tensor<128x32xf32, #blocked1>) -> tensor<128x32xf32, #blocked1> loc(#loc97)
|
| 94 |
+
%tmp12 = arith.mulf %tmp5_39, %tmp11 : tensor<128x32xf32, #blocked1> loc(#loc98)
|
| 95 |
+
%tmp15 = arith.select %tmp0_36, %tmp12, %cst_6 : tensor<128x32xi1, #blocked1>, tensor<128x32xf32, #blocked1> loc(#loc99)
|
| 96 |
+
%tmp16 = "tt.reduce"(%tmp15) <{axis = 1 : i32}> ({
|
| 97 |
+
^bb0(%tmp16_48: f32 loc(callsite(#loc1 at #loc100)), %tmp16_49: f32 loc(callsite(#loc1 at #loc100))):
|
| 98 |
+
%tmp16_50 = arith.addf %tmp16_48, %tmp16_49 : f32 loc(#loc125)
|
| 99 |
+
tt.reduce.return %tmp16_50 : f32 loc(#loc119)
|
| 100 |
+
}) : (tensor<128x32xf32, #blocked1>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc119)
|
| 101 |
+
%tmp16_45 = ttg.convert_layout %tmp16 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc101)
|
| 102 |
+
%tmp16_46 = tt.expand_dims %tmp16_45 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xf32, #blocked> loc(#loc101)
|
| 103 |
+
%tmp16_47 = tt.expand_dims %tmp16 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1xf32, #blocked1> loc(#loc101)
|
| 104 |
+
%tmp18 = arith.select %tmp7, %cst_3, %tmp16_47 : tensor<128x1xi1, #blocked1>, tensor<128x1xf32, #blocked1> loc(#loc102)
|
| 105 |
+
%tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_log2f"} : (tensor<128x1xf32, #blocked1>) -> tensor<128x1xf32, #blocked1> loc(#loc103)
|
| 106 |
+
%tmp20 = arith.addf %tmp19, %tmp4_42 : tensor<128x1xf32, #blocked1> loc(#loc104)
|
| 107 |
+
%tmp22 = arith.mulf %tmp20, %cst_2 : tensor<128x1xf32, #blocked1> loc(#loc105)
|
| 108 |
+
%0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc47)
|
| 109 |
+
%1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc48)
|
| 110 |
+
%2 = arith.extui %1 : i1 to i64 loc(#loc49)
|
| 111 |
+
%3 = arith.muli %ks0, %2 : i64 loc(#loc49)
|
| 112 |
+
%4 = arith.extui %0 : i1 to i64 loc(#loc106)
|
| 113 |
+
%5 = arith.addi %4, %3 : i64 loc(#loc50)
|
| 114 |
+
%6 = tt.splat %5 : i64 -> tensor<128x1xi64, #blocked> loc(#loc52)
|
| 115 |
+
%7 = arith.muli %x3_27, %6 : tensor<128x1xi64, #blocked> loc(#loc52)
|
| 116 |
+
%8 = arith.addi %x2_21, %7 : tensor<128x1xi64, #blocked> loc(#loc53)
|
| 117 |
+
%9 = tt.splat %out_ptr2 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>, #blocked> loc(#loc54)
|
| 118 |
+
%10 = tt.addptr %9, %8 : tensor<128x1x!tt.ptr<f32>, #blocked>, tensor<128x1xi64, #blocked> loc(#loc54)
|
| 119 |
+
%11 = ttg.convert_layout %tmp22 : tensor<128x1xf32, #blocked1> -> tensor<128x1xf32, #blocked> loc(#loc55)
|
| 120 |
+
tt.store %10, %11, %xmask_17 : tensor<128x1x!tt.ptr<f32>, #blocked> loc(#loc55)
|
| 121 |
+
%12 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>, #blocked> loc(#loc56)
|
| 122 |
+
%13 = tt.addptr %12, %xindex_14 : tensor<128x1x!tt.ptr<f32>, #blocked>, tensor<128x1xi32, #blocked> loc(#loc56)
|
| 123 |
+
tt.store %13, %tmp4_41, %xmask_17 : tensor<128x1x!tt.ptr<f32>, #blocked> loc(#loc57)
|
| 124 |
+
%14 = tt.splat %out_ptr1 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>, #blocked> loc(#loc58)
|
| 125 |
+
%15 = tt.addptr %14, %xindex_14 : tensor<128x1x!tt.ptr<f32>, #blocked>, tensor<128x1xi32, #blocked> loc(#loc58)
|
| 126 |
+
tt.store %15, %tmp16_46, %xmask_17 : tensor<128x1x!tt.ptr<f32>, #blocked> loc(#loc59)
|
| 127 |
+
tt.return loc(#loc60)
|
| 128 |
+
} loc(#loc)
|
| 129 |
+
} loc(#loc)
|
| 130 |
+
#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":23:28)
|
| 131 |
+
#loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":23:33)
|
| 132 |
+
#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":24:44)
|
| 133 |
+
#loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":24:23)
|
| 134 |
+
#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":25:21)
|
| 135 |
+
#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":26:38)
|
| 136 |
+
#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":33:19)
|
| 137 |
+
#loc9 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
|
| 138 |
+
#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":34:51)
|
| 139 |
+
#loc11 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
|
| 140 |
+
#loc12 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
|
| 141 |
+
#loc13 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
|
| 142 |
+
#loc14 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
|
| 143 |
+
#loc15 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36)
|
| 144 |
+
#loc16 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
|
| 145 |
+
#loc17 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
|
| 146 |
+
#loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":35:38)
|
| 147 |
+
#loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":35:42)
|
| 148 |
+
#loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":35:35)
|
| 149 |
+
#loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":35:30)
|
| 150 |
+
#loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":35:49)
|
| 151 |
+
#loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":36:30)
|
| 152 |
+
#loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":36:49)
|
| 153 |
+
#loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":38:33)
|
| 154 |
+
#loc26 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29)
|
| 155 |
+
#loc28 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15)
|
| 156 |
+
#loc29 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21)
|
| 157 |
+
#loc30 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16)
|
| 158 |
+
#loc31 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29)
|
| 159 |
+
#loc32 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":39:40)
|
| 160 |
+
#loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":41:19)
|
| 161 |
+
#loc34 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":42:18)
|
| 162 |
+
#loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":44:33)
|
| 163 |
+
#loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":45:27)
|
| 164 |
+
#loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":46:19)
|
| 165 |
+
#loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":48:35)
|
| 166 |
+
#loc39 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
|
| 167 |
+
#loc41 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
|
| 168 |
+
#loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":49:29)
|
| 169 |
+
#loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":51:34)
|
| 170 |
+
#loc44 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":52:27)
|
| 171 |
+
#loc45 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":53:20)
|
| 172 |
+
#loc46 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":55:20)
|
| 173 |
+
#loc47 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:49)
|
| 174 |
+
#loc48 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:75)
|
| 175 |
+
#loc49 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:66)
|
| 176 |
+
#loc50 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:57)
|
| 177 |
+
#loc51 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:41)
|
| 178 |
+
#loc52 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:34)
|
| 179 |
+
#loc53 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:30)
|
| 180 |
+
#loc54 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:25)
|
| 181 |
+
#loc55 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:89)
|
| 182 |
+
#loc56 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":57:25)
|
| 183 |
+
#loc57 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":57:36)
|
| 184 |
+
#loc58 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":58:25)
|
| 185 |
+
#loc59 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":58:37)
|
| 186 |
+
#loc60 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":58:4)
|
| 187 |
+
#loc69 = loc("xoffset"(#loc2))
|
| 188 |
+
#loc70 = loc("xoffset"(#loc3))
|
| 189 |
+
#loc71 = loc("xindex"(#loc4))
|
| 190 |
+
#loc72 = loc("xindex"(#loc5))
|
| 191 |
+
#loc73 = loc("xmask"(#loc6))
|
| 192 |
+
#loc74 = loc("r0_index"(#loc7))
|
| 193 |
+
#loc75 = loc("x2"(#loc8))
|
| 194 |
+
#loc76 = loc("quot"(#loc9))
|
| 195 |
+
#loc77 = loc("x3"(#loc10))
|
| 196 |
+
#loc78 = loc("fixed"(#loc11))
|
| 197 |
+
#loc79 = loc("fixed"(#loc12))
|
| 198 |
+
#loc80 = loc("fixed"(#loc13))
|
| 199 |
+
#loc81 = loc("tmp0"(#loc18))
|
| 200 |
+
#loc82 = loc("tmp0"(#loc19))
|
| 201 |
+
#loc83 = loc("tmp0"(#loc20))
|
| 202 |
+
#loc84 = loc("tmp0"(#loc21))
|
| 203 |
+
#loc85 = loc("tmp0"(#loc22))
|
| 204 |
+
#loc86 = loc("tmp5"(#loc23))
|
| 205 |
+
#loc87 = loc("tmp5"(#loc24))
|
| 206 |
+
#loc88 = loc("tmp3"(#loc25))
|
| 207 |
+
#loc90 = loc("mask"(#loc28))
|
| 208 |
+
#loc91 = loc("mask"(#loc29))
|
| 209 |
+
#loc92 = loc("mask"(#loc30))
|
| 210 |
+
#loc93 = loc("tmp4"(#loc32))
|
| 211 |
+
#loc94 = loc("tmp7"(#loc33))
|
| 212 |
+
#loc95 = loc("tmp8"(#loc34))
|
| 213 |
+
#loc96 = loc("tmp10"(#loc35))
|
| 214 |
+
#loc97 = loc("tmp11"(#loc36))
|
| 215 |
+
#loc98 = loc("tmp12"(#loc37))
|
| 216 |
+
#loc99 = loc("tmp15"(#loc38))
|
| 217 |
+
#loc101 = loc("tmp16"(#loc42))
|
| 218 |
+
#loc102 = loc("tmp18"(#loc43))
|
| 219 |
+
#loc103 = loc("tmp19"(#loc44))
|
| 220 |
+
#loc104 = loc("tmp20"(#loc45))
|
| 221 |
+
#loc105 = loc("tmp22"(#loc46))
|
| 222 |
+
#loc106 = loc(fused[#loc50, #loc51])
|
| 223 |
+
#loc107 = loc(callsite(#loc76 at #loc77))
|
| 224 |
+
#loc108 = loc(callsite(#loc78 at #loc77))
|
| 225 |
+
#loc109 = loc(callsite(#loc79 at #loc77))
|
| 226 |
+
#loc110 = loc(callsite(#loc80 at #loc77))
|
| 227 |
+
#loc111 = loc(callsite(#loc14 at #loc77))
|
| 228 |
+
#loc112 = loc(callsite(#loc15 at #loc77))
|
| 229 |
+
#loc113 = loc(callsite(#loc16 at #loc77))
|
| 230 |
+
#loc114 = loc(callsite(#loc17 at #loc77))
|
| 231 |
+
#loc115 = loc(callsite(#loc26 at #loc89))
|
| 232 |
+
#loc117 = loc("mask"(#loc90))
|
| 233 |
+
#loc118 = loc("mask"(#loc92))
|
| 234 |
+
#loc119 = loc(callsite(#loc39 at #loc100))
|
| 235 |
+
#loc121 = loc(callsite(#loc117 at #loc115))
|
| 236 |
+
#loc122 = loc(callsite(#loc91 at #loc115))
|
| 237 |
+
#loc123 = loc(callsite(#loc118 at #loc115))
|
| 238 |
+
#loc124 = loc(callsite(#loc31 at #loc115))
|
| 239 |
+
#loc125 = loc(callsite(#loc41 at #loc119))
|
progress/SpecForge/cache/compiled_kernels/triton/2/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.ttir
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":18:0)
|
| 2 |
+
#loc6 = loc(unknown)
|
| 3 |
+
#loc32 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":39:37)
|
| 4 |
+
#loc44 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":49:26)
|
| 5 |
+
#loc63 = loc("in_ptr0"(#loc))
|
| 6 |
+
#loc64 = loc("in_ptr1"(#loc))
|
| 7 |
+
#loc65 = loc("out_ptr0"(#loc))
|
| 8 |
+
#loc66 = loc("out_ptr1"(#loc))
|
| 9 |
+
#loc67 = loc("out_ptr2"(#loc))
|
| 10 |
+
#loc68 = loc("ks0"(#loc))
|
| 11 |
+
#loc69 = loc("xnumel"(#loc))
|
| 12 |
+
#loc70 = loc("r0_numel"(#loc))
|
| 13 |
+
#loc96 = loc("tmp4"(#loc32))
|
| 14 |
+
#loc106 = loc("tmp16"(#loc44))
|
| 15 |
+
#loc120 = loc(callsite(#loc6 at #loc96))
|
| 16 |
+
#loc124 = loc(callsite(#loc6 at #loc106))
|
| 17 |
+
module {
|
| 18 |
+
tt.func public @triton_per_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 19 |
+
%fixed = arith.constant dense<1> : tensor<128x1xi64> loc(#loc111)
|
| 20 |
+
%x3 = arith.constant dense<0> : tensor<128x1xi32> loc(#loc112)
|
| 21 |
+
%fixed_0 = arith.constant dense<0> : tensor<128x1xi64> loc(#loc113)
|
| 22 |
+
%x3_1 = arith.constant 0 : i64 loc(#loc114)
|
| 23 |
+
%c1_i64 = arith.constant 1 : i64 loc(#loc6)
|
| 24 |
+
%tmp22 = arith.constant dense<0.693147182> : tensor<128x1xf32> loc(#loc74)
|
| 25 |
+
%tmp18 = arith.constant dense<1.000000e+00> : tensor<128x1xf32> loc(#loc75)
|
| 26 |
+
%tmp7 = arith.constant dense<0xFF800000> : tensor<128x1xf32> loc(#loc76)
|
| 27 |
+
%tmp3 = arith.constant dense<0xFF800000> : tensor<128x32xf32> loc(#loc77)
|
| 28 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<128x32xf32> loc(#loc6)
|
| 29 |
+
%c32_i64 = arith.constant 32 : i64 loc(#loc6)
|
| 30 |
+
%c128_i32 = arith.constant 128 : i32 loc(#loc6)
|
| 31 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc78)
|
| 32 |
+
%xoffset_2 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc79)
|
| 33 |
+
%xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc80)
|
| 34 |
+
%xindex_3 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc81)
|
| 35 |
+
%xindex_4 = tt.splat %xoffset_2 : i32 -> tensor<128x1xi32> loc(#loc82)
|
| 36 |
+
%xindex_5 = arith.addi %xindex_4, %xindex_3 : tensor<128x1xi32> loc(#loc82)
|
| 37 |
+
%xmask = tt.splat %xnumel : i32 -> tensor<128x1xi32> loc(#loc83)
|
| 38 |
+
%xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<128x1xi32> loc(#loc83)
|
| 39 |
+
%r0_index = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc84)
|
| 40 |
+
%r0_index_7 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc85)
|
| 41 |
+
%x2 = arith.extsi %xindex_5 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc86)
|
| 42 |
+
%x2_8 = tt.splat %ks0 : i64 -> tensor<128x1xi64> loc(#loc86)
|
| 43 |
+
%x2_9 = arith.remsi %x2, %x2_8 : tensor<128x1xi64> loc(#loc86)
|
| 44 |
+
%quot = arith.divsi %x2, %x2_8 : tensor<128x1xi64> loc(#loc115)
|
| 45 |
+
%fixed_10 = arith.cmpi ne, %x2_9, %fixed_0 : tensor<128x1xi64> loc(#loc113)
|
| 46 |
+
%fixed_11 = arith.subi %quot, %fixed : tensor<128x1xi64> loc(#loc111)
|
| 47 |
+
%fixed_12 = arith.select %fixed_10, %fixed_11, %quot : tensor<128x1xi1>, tensor<128x1xi64> loc(#loc116)
|
| 48 |
+
%x3_13 = arith.cmpi slt, %xindex_5, %x3 : tensor<128x1xi32> loc(#loc112)
|
| 49 |
+
%x3_14 = arith.cmpi slt, %ks0, %x3_1 : i64 loc(#loc114)
|
| 50 |
+
%x3_15 = tt.splat %x3_14 : i1 -> tensor<128x1xi1> loc(#loc117)
|
| 51 |
+
%x3_16 = arith.cmpi ne, %x3_13, %x3_15 : tensor<128x1xi1> loc(#loc117)
|
| 52 |
+
%x3_17 = arith.select %x3_16, %fixed_12, %quot : tensor<128x1xi1>, tensor<128x1xi64> loc(#loc118)
|
| 53 |
+
%tmp0 = arith.muli %ks0, %c32_i64 : i64 loc(#loc89)
|
| 54 |
+
%tmp0_18 = arith.extsi %r0_index_7 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc90)
|
| 55 |
+
%tmp0_19 = tt.splat %tmp0 : i64 -> tensor<1x32xi64> loc(#loc90)
|
| 56 |
+
%tmp0_20 = arith.muli %tmp0_19, %tmp0_18 : tensor<1x32xi64> loc(#loc90)
|
| 57 |
+
%tmp0_21 = tt.broadcast %x2 : tensor<128x1xi64> -> tensor<128x32xi64> loc(#loc91)
|
| 58 |
+
%tmp0_22 = tt.broadcast %tmp0_20 : tensor<1x32xi64> -> tensor<128x32xi64> loc(#loc91)
|
| 59 |
+
%tmp0_23 = arith.addi %tmp0_21, %tmp0_22 : tensor<128x32xi64> loc(#loc91)
|
| 60 |
+
%tmp0_24 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<128x32x!tt.ptr<f32>> loc(#loc92)
|
| 61 |
+
%tmp0_25 = tt.addptr %tmp0_24, %tmp0_23 : tensor<128x32x!tt.ptr<f32>>, tensor<128x32xi64> loc(#loc92)
|
| 62 |
+
%tmp0_26 = tt.broadcast %xmask_6 : tensor<128x1xi1> -> tensor<128x32xi1> loc(#loc93)
|
| 63 |
+
%tmp0_27 = tt.load %tmp0_25, %tmp0_26, %cst : tensor<128x32x!tt.ptr<f32>> loc(#loc93)
|
| 64 |
+
%tmp5 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<128x32x!tt.ptr<f32>> loc(#loc94)
|
| 65 |
+
%tmp5_28 = tt.addptr %tmp5, %tmp0_23 : tensor<128x32x!tt.ptr<f32>>, tensor<128x32xi64> loc(#loc94)
|
| 66 |
+
%tmp5_29 = tt.load %tmp5_28, %tmp0_26, %cst : tensor<128x32x!tt.ptr<f32>> loc(#loc95)
|
| 67 |
+
%tmp3_30 = arith.select %tmp0_26, %tmp0_27, %tmp3 : tensor<128x32xi1>, tensor<128x32xf32> loc(#loc77)
|
| 68 |
+
%tmp4 = "tt.reduce"(%tmp3_30) <{axis = 1 : i32}> ({
|
| 69 |
+
^bb0(%tmp4_38: f32 loc(callsite(#loc6 at #loc96)), %tmp4_39: f32 loc(callsite(#loc6 at #loc96))):
|
| 70 |
+
%mask = arith.cmpf ogt, %tmp4_38, %tmp4_39 : f32 loc(#loc125)
|
| 71 |
+
%mask_40 = arith.cmpf une, %tmp4_38, %tmp4_38 : f32 loc(#loc126)
|
| 72 |
+
%mask_41 = arith.ori %mask, %mask_40 : i1 loc(#loc127)
|
| 73 |
+
%tmp4_42 = arith.select %mask_41, %tmp4_38, %tmp4_39 : f32 loc(#loc128)
|
| 74 |
+
tt.reduce.return %tmp4_42 : f32 loc(#loc119)
|
| 75 |
+
}) : (tensor<128x32xf32>) -> tensor<128xf32> loc(#loc119)
|
| 76 |
+
%tmp4_31 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc100)
|
| 77 |
+
%tmp7_32 = arith.cmpf oeq, %tmp4_31, %tmp7 : tensor<128x1xf32> loc(#loc76)
|
| 78 |
+
%tmp8 = tt.broadcast %tmp4_31 : tensor<128x1xf32> -> tensor<128x32xf32> loc(#loc101)
|
| 79 |
+
%tmp8_33 = arith.subf %tmp0_27, %tmp8 : tensor<128x32xf32> loc(#loc101)
|
| 80 |
+
%tmp10 = tt.broadcast %tmp7_32 : tensor<128x1xi1> -> tensor<128x32xi1> loc(#loc102)
|
| 81 |
+
%tmp10_34 = arith.select %tmp10, %cst, %tmp8_33 : tensor<128x32xi1>, tensor<128x32xf32> loc(#loc102)
|
| 82 |
+
%tmp11 = tt.extern_elementwise %tmp10_34 {libname = "", libpath = "", pure = true, symbol = "__nv_exp2f"} : (tensor<128x32xf32>) -> tensor<128x32xf32> loc(#loc103)
|
| 83 |
+
%tmp12 = arith.mulf %tmp5_29, %tmp11 : tensor<128x32xf32> loc(#loc104)
|
| 84 |
+
%tmp15 = arith.select %tmp0_26, %tmp12, %cst : tensor<128x32xi1>, tensor<128x32xf32> loc(#loc105)
|
| 85 |
+
%tmp16 = "tt.reduce"(%tmp15) <{axis = 1 : i32}> ({
|
| 86 |
+
^bb0(%tmp16_38: f32 loc(callsite(#loc6 at #loc106)), %tmp16_39: f32 loc(callsite(#loc6 at #loc106))):
|
| 87 |
+
%tmp16_40 = arith.addf %tmp16_38, %tmp16_39 : f32 loc(#loc129)
|
| 88 |
+
tt.reduce.return %tmp16_40 : f32 loc(#loc123)
|
| 89 |
+
}) : (tensor<128x32xf32>) -> tensor<128xf32> loc(#loc123)
|
| 90 |
+
%tmp16_35 = tt.expand_dims %tmp16 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc107)
|
| 91 |
+
%tmp18_36 = arith.select %tmp7_32, %tmp18, %tmp16_35 : tensor<128x1xi1>, tensor<128x1xf32> loc(#loc75)
|
| 92 |
+
%tmp19 = tt.extern_elementwise %tmp18_36 {libname = "", libpath = "", pure = true, symbol = "__nv_log2f"} : (tensor<128x1xf32>) -> tensor<128x1xf32> loc(#loc108)
|
| 93 |
+
%tmp20 = arith.addf %tmp19, %tmp4_31 : tensor<128x1xf32> loc(#loc109)
|
| 94 |
+
%tmp22_37 = arith.mulf %tmp20, %tmp22 : tensor<128x1xf32> loc(#loc74)
|
| 95 |
+
%0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc49)
|
| 96 |
+
%1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc50)
|
| 97 |
+
%2 = arith.extui %1 : i1 to i64 loc(#loc51)
|
| 98 |
+
%3 = arith.muli %ks0, %2 : i64 loc(#loc51)
|
| 99 |
+
%4 = arith.extui %0 : i1 to i64 loc(#loc110)
|
| 100 |
+
%5 = arith.addi %4, %3 : i64 loc(#loc52)
|
| 101 |
+
%6 = tt.splat %5 : i64 -> tensor<128x1xi64> loc(#loc54)
|
| 102 |
+
%7 = arith.muli %x3_17, %6 : tensor<128x1xi64> loc(#loc54)
|
| 103 |
+
%8 = arith.addi %x2_9, %7 : tensor<128x1xi64> loc(#loc55)
|
| 104 |
+
%9 = tt.splat %out_ptr2 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>> loc(#loc56)
|
| 105 |
+
%10 = tt.addptr %9, %8 : tensor<128x1x!tt.ptr<f32>>, tensor<128x1xi64> loc(#loc56)
|
| 106 |
+
tt.store %10, %tmp22_37, %xmask_6 : tensor<128x1x!tt.ptr<f32>> loc(#loc57)
|
| 107 |
+
%11 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>> loc(#loc58)
|
| 108 |
+
%12 = tt.addptr %11, %xindex_5 : tensor<128x1x!tt.ptr<f32>>, tensor<128x1xi32> loc(#loc58)
|
| 109 |
+
tt.store %12, %tmp4_31, %xmask_6 : tensor<128x1x!tt.ptr<f32>> loc(#loc59)
|
| 110 |
+
%13 = tt.splat %out_ptr1 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>> loc(#loc60)
|
| 111 |
+
%14 = tt.addptr %13, %xindex_5 : tensor<128x1x!tt.ptr<f32>>, tensor<128x1xi32> loc(#loc60)
|
| 112 |
+
tt.store %14, %tmp16_35, %xmask_6 : tensor<128x1x!tt.ptr<f32>> loc(#loc61)
|
| 113 |
+
tt.return loc(#loc62)
|
| 114 |
+
} loc(#loc)
|
| 115 |
+
} loc(#loc)
|
| 116 |
+
#loc1 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
|
| 117 |
+
#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":34:51)
|
| 118 |
+
#loc3 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
|
| 119 |
+
#loc4 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
|
| 120 |
+
#loc5 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36)
|
| 121 |
+
#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":55:20)
|
| 122 |
+
#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":51:34)
|
| 123 |
+
#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":41:19)
|
| 124 |
+
#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":38:33)
|
| 125 |
+
#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":23:28)
|
| 126 |
+
#loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":23:33)
|
| 127 |
+
#loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":24:36)
|
| 128 |
+
#loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":24:44)
|
| 129 |
+
#loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":24:23)
|
| 130 |
+
#loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":25:21)
|
| 131 |
+
#loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":26:28)
|
| 132 |
+
#loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":26:38)
|
| 133 |
+
#loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":33:19)
|
| 134 |
+
#loc20 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
|
| 135 |
+
#loc21 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
|
| 136 |
+
#loc22 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
|
| 137 |
+
#loc23 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
|
| 138 |
+
#loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":35:38)
|
| 139 |
+
#loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":35:42)
|
| 140 |
+
#loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":35:35)
|
| 141 |
+
#loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":35:30)
|
| 142 |
+
#loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":35:49)
|
| 143 |
+
#loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":36:30)
|
| 144 |
+
#loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":36:49)
|
| 145 |
+
#loc31 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29)
|
| 146 |
+
#loc33 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15)
|
| 147 |
+
#loc34 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21)
|
| 148 |
+
#loc35 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16)
|
| 149 |
+
#loc36 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29)
|
| 150 |
+
#loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":39:40)
|
| 151 |
+
#loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":42:18)
|
| 152 |
+
#loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":44:33)
|
| 153 |
+
#loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":45:27)
|
| 154 |
+
#loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":46:19)
|
| 155 |
+
#loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":48:35)
|
| 156 |
+
#loc43 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
|
| 157 |
+
#loc45 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
|
| 158 |
+
#loc46 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":49:29)
|
| 159 |
+
#loc47 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":52:27)
|
| 160 |
+
#loc48 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":53:20)
|
| 161 |
+
#loc49 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:49)
|
| 162 |
+
#loc50 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:75)
|
| 163 |
+
#loc51 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:66)
|
| 164 |
+
#loc52 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:57)
|
| 165 |
+
#loc53 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:41)
|
| 166 |
+
#loc54 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:34)
|
| 167 |
+
#loc55 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:30)
|
| 168 |
+
#loc56 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:25)
|
| 169 |
+
#loc57 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":56:89)
|
| 170 |
+
#loc58 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":57:25)
|
| 171 |
+
#loc59 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":57:36)
|
| 172 |
+
#loc60 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":58:25)
|
| 173 |
+
#loc61 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":58:37)
|
| 174 |
+
#loc62 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/5c/c5cxskvlmc4hq63kmkcexukwuhivuede7r7tzidpzo5llwdedeem.py":58:4)
|
| 175 |
+
#loc71 = loc("fixed"(#loc1))
|
| 176 |
+
#loc72 = loc("x3"(#loc2))
|
| 177 |
+
#loc73 = loc("fixed"(#loc4))
|
| 178 |
+
#loc74 = loc("tmp22"(#loc7))
|
| 179 |
+
#loc75 = loc("tmp18"(#loc8))
|
| 180 |
+
#loc76 = loc("tmp7"(#loc9))
|
| 181 |
+
#loc77 = loc("tmp3"(#loc10))
|
| 182 |
+
#loc78 = loc("xoffset"(#loc11))
|
| 183 |
+
#loc79 = loc("xoffset"(#loc12))
|
| 184 |
+
#loc80 = loc("xindex"(#loc13))
|
| 185 |
+
#loc81 = loc("xindex"(#loc14))
|
| 186 |
+
#loc82 = loc("xindex"(#loc15))
|
| 187 |
+
#loc83 = loc("xmask"(#loc16))
|
| 188 |
+
#loc84 = loc("r0_index"(#loc17))
|
| 189 |
+
#loc85 = loc("r0_index"(#loc18))
|
| 190 |
+
#loc86 = loc("x2"(#loc19))
|
| 191 |
+
#loc87 = loc("quot"(#loc20))
|
| 192 |
+
#loc88 = loc("fixed"(#loc21))
|
| 193 |
+
#loc89 = loc("tmp0"(#loc24))
|
| 194 |
+
#loc90 = loc("tmp0"(#loc25))
|
| 195 |
+
#loc91 = loc("tmp0"(#loc26))
|
| 196 |
+
#loc92 = loc("tmp0"(#loc27))
|
| 197 |
+
#loc93 = loc("tmp0"(#loc28))
|
| 198 |
+
#loc94 = loc("tmp5"(#loc29))
|
| 199 |
+
#loc95 = loc("tmp5"(#loc30))
|
| 200 |
+
#loc97 = loc("mask"(#loc33))
|
| 201 |
+
#loc98 = loc("mask"(#loc34))
|
| 202 |
+
#loc99 = loc("mask"(#loc35))
|
| 203 |
+
#loc100 = loc("tmp4"(#loc37))
|
| 204 |
+
#loc101 = loc("tmp8"(#loc38))
|
| 205 |
+
#loc102 = loc("tmp10"(#loc39))
|
| 206 |
+
#loc103 = loc("tmp11"(#loc40))
|
| 207 |
+
#loc104 = loc("tmp12"(#loc41))
|
| 208 |
+
#loc105 = loc("tmp15"(#loc42))
|
| 209 |
+
#loc107 = loc("tmp16"(#loc46))
|
| 210 |
+
#loc108 = loc("tmp19"(#loc47))
|
| 211 |
+
#loc109 = loc("tmp20"(#loc48))
|
| 212 |
+
#loc110 = loc(fused[#loc52, #loc53])
|
| 213 |
+
#loc111 = loc(callsite(#loc71 at #loc72))
|
| 214 |
+
#loc112 = loc(callsite(#loc3 at #loc72))
|
| 215 |
+
#loc113 = loc(callsite(#loc73 at #loc72))
|
| 216 |
+
#loc114 = loc(callsite(#loc5 at #loc72))
|
| 217 |
+
#loc115 = loc(callsite(#loc87 at #loc72))
|
| 218 |
+
#loc116 = loc(callsite(#loc88 at #loc72))
|
| 219 |
+
#loc117 = loc(callsite(#loc22 at #loc72))
|
| 220 |
+
#loc118 = loc(callsite(#loc23 at #loc72))
|
| 221 |
+
#loc119 = loc(callsite(#loc31 at #loc96))
|
| 222 |
+
#loc121 = loc("mask"(#loc97))
|
| 223 |
+
#loc122 = loc("mask"(#loc99))
|
| 224 |
+
#loc123 = loc(callsite(#loc43 at #loc106))
|
| 225 |
+
#loc125 = loc(callsite(#loc121 at #loc119))
|
| 226 |
+
#loc126 = loc(callsite(#loc98 at #loc119))
|
| 227 |
+
#loc127 = loc(callsite(#loc122 at #loc119))
|
| 228 |
+
#loc128 = loc(callsite(#loc36 at #loc119))
|
| 229 |
+
#loc129 = loc(callsite(#loc45 at #loc123))
|
progress/SpecForge/cache/compiled_kernels/triton/2/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/__grp__triton_poi_fused_mul_1.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"child_paths": {"triton_poi_fused_mul_1.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.source", "triton_poi_fused_mul_1.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ttir", "triton_poi_fused_mul_1.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ttgir", "triton_poi_fused_mul_1.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.llir", "triton_poi_fused_mul_1.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ptx", "triton_poi_fused_mul_1.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.cubin", "triton_poi_fused_mul_1.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.json"}}
|
progress/SpecForge/cache/compiled_kernels/triton/2/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.cubin
ADDED
|
Binary file (10.3 kB). View file
|
|
|
progress/SpecForge/cache/compiled_kernels/triton/2/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"hash": "d650530c018e98a61be4958bef98391bf5f4932885981bbdd4c94cc375f6e8e4", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_mul_1"}
|
progress/SpecForge/cache/compiled_kernels/triton/2/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.llir
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
; ModuleID = 'LLVMDialectModule'
|
| 2 |
+
source_filename = "LLVMDialectModule"
|
| 3 |
+
target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
| 4 |
+
|
| 5 |
+
; Function Attrs: nounwind
|
| 6 |
+
define ptx_kernel void @triton_poi_fused_mul_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
|
| 7 |
+
%7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
|
| 8 |
+
%8 = shl i32 %7, 7, !dbg !8
|
| 9 |
+
%9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
|
| 10 |
+
%10 = and i32 %9, 127, !dbg !9
|
| 11 |
+
%11 = or disjoint i32 %8, %10, !dbg !10
|
| 12 |
+
%12 = icmp slt i32 %11, %3, !dbg !11
|
| 13 |
+
%13 = sext i32 %11 to i64, !dbg !12
|
| 14 |
+
%.frozen = freeze i64 %2, !dbg !13
|
| 15 |
+
%14 = sdiv i64 %13, %.frozen, !dbg !13
|
| 16 |
+
%15 = mul i64 %14, %.frozen, !dbg !12
|
| 17 |
+
%.decomposed = sub i64 %13, %15, !dbg !12
|
| 18 |
+
%.not = icmp ne i64 %.decomposed, 0, !dbg !17
|
| 19 |
+
%16 = icmp slt i32 %8, 0, !dbg !18
|
| 20 |
+
%17 = icmp slt i64 %2, 0, !dbg !19
|
| 21 |
+
%18 = xor i1 %16, %17, !dbg !20
|
| 22 |
+
%narrow = select i1 %18, i1 %.not, i1 false, !dbg !21
|
| 23 |
+
%19 = sext i1 %narrow to i64, !dbg !21
|
| 24 |
+
%20 = add nsw i64 %14, %19, !dbg !21
|
| 25 |
+
%21 = getelementptr float, ptr addrspace(1) %0, i64 %13, !dbg !22
|
| 26 |
+
%22 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !23
|
| 27 |
+
%23 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %21, i64 %22, i1 %12) #2, !dbg !23
|
| 28 |
+
%24 = bitcast i32 %23 to float, !dbg !23
|
| 29 |
+
%25 = fmul float %24, 0x3FE62E4300000000, !dbg !24
|
| 30 |
+
%26 = icmp slt i64 %2, 2, !dbg !25
|
| 31 |
+
%27 = icmp sgt i64 %2, 1, !dbg !26
|
| 32 |
+
%28 = select i1 %27, i64 %2, i64 0, !dbg !27
|
| 33 |
+
%29 = zext i1 %26 to i64, !dbg !28
|
| 34 |
+
%30 = add i64 %28, %29, !dbg !29
|
| 35 |
+
%31 = mul i64 %20, %30, !dbg !30
|
| 36 |
+
%32 = getelementptr float, ptr addrspace(1) %1, i64 %.decomposed, !dbg !31
|
| 37 |
+
%33 = getelementptr float, ptr addrspace(1) %32, i64 %31, !dbg !31
|
| 38 |
+
%34 = bitcast float %25 to i32, !dbg !32
|
| 39 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %34, ptr addrspace(1) %33, i1 %12) #2, !dbg !32
|
| 40 |
+
ret void, !dbg !33
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 44 |
+
declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
|
| 45 |
+
|
| 46 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 47 |
+
declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
|
| 48 |
+
|
| 49 |
+
attributes #0 = { nounwind "nvvm.reqntid"="128" }
|
| 50 |
+
attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
| 51 |
+
attributes #2 = { nounwind }
|
| 52 |
+
|
| 53 |
+
!llvm.dbg.cu = !{!0}
|
| 54 |
+
!llvm.module.flags = !{!2, !3}
|
| 55 |
+
|
| 56 |
+
!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
|
| 57 |
+
!1 = !DIFile(filename: "c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py", directory: "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j")
|
| 58 |
+
!2 = !{i32 2, !"Debug Info Version", i32 3}
|
| 59 |
+
!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
| 60 |
+
!4 = distinct !DISubprogram(name: "triton_poi_fused_mul_1", linkageName: "triton_poi_fused_mul_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
|
| 61 |
+
!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
|
| 62 |
+
!6 = !{}
|
| 63 |
+
!7 = !DILocation(line: 19, column: 28, scope: !4)
|
| 64 |
+
!8 = !DILocation(line: 19, column: 33, scope: !4)
|
| 65 |
+
!9 = !DILocation(line: 20, column: 36, scope: !4)
|
| 66 |
+
!10 = !DILocation(line: 20, column: 23, scope: !4)
|
| 67 |
+
!11 = !DILocation(line: 21, column: 21, scope: !4)
|
| 68 |
+
!12 = !DILocation(line: 23, column: 19, scope: !4)
|
| 69 |
+
!13 = !DILocation(line: 72, column: 16, scope: !14, inlinedAt: !16)
|
| 70 |
+
!14 = distinct !DILexicalBlockFile(scope: !4, file: !15, discriminator: 0)
|
| 71 |
+
!15 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime")
|
| 72 |
+
!16 = !DILocation(line: 24, column: 51, scope: !4)
|
| 73 |
+
!17 = !DILocation(line: 74, column: 34, scope: !14, inlinedAt: !16)
|
| 74 |
+
!18 = !DILocation(line: 75, column: 25, scope: !14, inlinedAt: !16)
|
| 75 |
+
!19 = !DILocation(line: 75, column: 36, scope: !14, inlinedAt: !16)
|
| 76 |
+
!20 = !DILocation(line: 75, column: 32, scope: !14, inlinedAt: !16)
|
| 77 |
+
!21 = !DILocation(line: 75, column: 47, scope: !14, inlinedAt: !16)
|
| 78 |
+
!22 = !DILocation(line: 25, column: 30, scope: !4)
|
| 79 |
+
!23 = !DILocation(line: 25, column: 35, scope: !4)
|
| 80 |
+
!24 = !DILocation(line: 27, column: 18, scope: !4)
|
| 81 |
+
!25 = !DILocation(line: 28, column: 49, scope: !4)
|
| 82 |
+
!26 = !DILocation(line: 28, column: 75, scope: !4)
|
| 83 |
+
!27 = !DILocation(line: 28, column: 66, scope: !4)
|
| 84 |
+
!28 = !DILocation(line: 28, scope: !4)
|
| 85 |
+
!29 = !DILocation(line: 28, column: 57, scope: !4)
|
| 86 |
+
!30 = !DILocation(line: 28, column: 34, scope: !4)
|
| 87 |
+
!31 = !DILocation(line: 28, column: 25, scope: !4)
|
| 88 |
+
!32 = !DILocation(line: 28, column: 88, scope: !4)
|
| 89 |
+
!33 = !DILocation(line: 28, column: 4, scope: !4)
|
progress/SpecForge/cache/compiled_kernels/triton/2/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ptx
ADDED
|
@@ -0,0 +1,357 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// Generated by LLVM NVPTX Back-End
|
| 3 |
+
//
|
| 4 |
+
|
| 5 |
+
.version 8.7
|
| 6 |
+
.target sm_90a
|
| 7 |
+
.address_size 64
|
| 8 |
+
|
| 9 |
+
// .globl triton_poi_fused_mul_1 // -- Begin function triton_poi_fused_mul_1
|
| 10 |
+
// @triton_poi_fused_mul_1
|
| 11 |
+
.visible .entry triton_poi_fused_mul_1(
|
| 12 |
+
.param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_0,
|
| 13 |
+
.param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_1,
|
| 14 |
+
.param .u64 triton_poi_fused_mul_1_param_2,
|
| 15 |
+
.param .u32 triton_poi_fused_mul_1_param_3,
|
| 16 |
+
.param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_4,
|
| 17 |
+
.param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_5
|
| 18 |
+
)
|
| 19 |
+
.reqntid 128
|
| 20 |
+
{
|
| 21 |
+
.reg .pred %p<11>;
|
| 22 |
+
.reg .b32 %r<13>;
|
| 23 |
+
.reg .b64 %rd<30>;
|
| 24 |
+
.loc 1 18 0 // c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:18:0
|
| 25 |
+
$L__func_begin0:
|
| 26 |
+
.loc 1 18 0 // c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:18:0
|
| 27 |
+
|
| 28 |
+
// %bb.0:
|
| 29 |
+
ld.param.b32 %r2, [triton_poi_fused_mul_1_param_3];
|
| 30 |
+
ld.param.b64 %rd7, [triton_poi_fused_mul_1_param_1];
|
| 31 |
+
ld.param.b64 %rd6, [triton_poi_fused_mul_1_param_0];
|
| 32 |
+
ld.param.b64 %rd8, [triton_poi_fused_mul_1_param_2];
|
| 33 |
+
$L__tmp0:
|
| 34 |
+
.loc 1 19 28 // c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:19:28
|
| 35 |
+
mov.u32 %r3, %ctaid.x;
|
| 36 |
+
.loc 1 19 33 // c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:19:33
|
| 37 |
+
shl.b32 %r1, %r3, 7;
|
| 38 |
+
.loc 1 20 36 // c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:20:36
|
| 39 |
+
mov.u32 %r4, %tid.x;
|
| 40 |
+
and.b32 %r5, %r4, 127;
|
| 41 |
+
.loc 1 20 23 // c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:20:23
|
| 42 |
+
or.b32 %r6, %r1, %r5;
|
| 43 |
+
.loc 1 23 19 // c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:23:19
|
| 44 |
+
cvt.s64.s32 %rd1, %r6;
|
| 45 |
+
$L__tmp1:
|
| 46 |
+
.loc 2 72 16 // triton_helpers.py:72:16 @[ c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:24:51 ]
|
| 47 |
+
or.b64 %rd10, %rd1, %rd8;
|
| 48 |
+
and.b64 %rd11, %rd10, -4294967296;
|
| 49 |
+
setp.ne.b64 %p1, %rd11, 0;
|
| 50 |
+
@%p1 bra $L__BB0_2;
|
| 51 |
+
bra.uni $L__BB0_1;
|
| 52 |
+
$L__BB0_2:
|
| 53 |
+
div.s64 %rd29, %rd1, %rd8;
|
| 54 |
+
bra.uni $L__BB0_3;
|
| 55 |
+
$L__BB0_1:
|
| 56 |
+
cvt.u32.u64 %r7, %rd8;
|
| 57 |
+
cvt.u32.u64 %r8, %rd1;
|
| 58 |
+
div.u32 %r9, %r8, %r7;
|
| 59 |
+
cvt.u64.u32 %rd29, %r9;
|
| 60 |
+
$L__tmp2:
|
| 61 |
+
$L__BB0_3:
|
| 62 |
+
.loc 2 0 16 // triton_helpers.py:0:16
|
| 63 |
+
cvt.u32.u64 %r12, %rd1;
|
| 64 |
+
.loc 1 21 21 // c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:21:21
|
| 65 |
+
setp.lt.s32 %p2, %r12, %r2;
|
| 66 |
+
.loc 1 23 19 // c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:23:19
|
| 67 |
+
mul.lo.s64 %rd17, %rd29, %rd8;
|
| 68 |
+
sub.s64 %rd18, %rd1, %rd17;
|
| 69 |
+
$L__tmp3:
|
| 70 |
+
.loc 2 74 34 // triton_helpers.py:74:34 @[ c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:24:51 ]
|
| 71 |
+
setp.ne.b64 %p4, %rd18, 0;
|
| 72 |
+
.loc 2 75 25 // triton_helpers.py:75:25 @[ c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:24:51 ]
|
| 73 |
+
setp.lt.s32 %p5, %r1, 0;
|
| 74 |
+
.loc 2 75 36 // triton_helpers.py:75:36 @[ c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:24:51 ]
|
| 75 |
+
setp.lt.s64 %p6, %rd8, 0;
|
| 76 |
+
.loc 2 75 32 // triton_helpers.py:75:32 @[ c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:24:51 ]
|
| 77 |
+
xor.pred %p7, %p5, %p6;
|
| 78 |
+
.loc 2 75 47 // triton_helpers.py:75:47 @[ c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:24:51 ]
|
| 79 |
+
and.pred %p8, %p7, %p4;
|
| 80 |
+
selp.b64 %rd19, -1, 0, %p8;
|
| 81 |
+
add.s64 %rd20, %rd29, %rd19;
|
| 82 |
+
$L__tmp4:
|
| 83 |
+
.loc 1 25 30 // c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:25:30
|
| 84 |
+
shl.b64 %rd21, %rd1, 2;
|
| 85 |
+
add.s64 %rd13, %rd6, %rd21;
|
| 86 |
+
.loc 1 25 35 // c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:25:35
|
| 87 |
+
// begin inline asm
|
| 88 |
+
mov.u64 %rd14, 0x0;
|
| 89 |
+
createpolicy.fractional.L2::evict_last.b64 %rd14, 1.0;
|
| 90 |
+
// end inline asm
|
| 91 |
+
// begin inline asm
|
| 92 |
+
mov.u32 %r10, 0x0;
|
| 93 |
+
@%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r10 }, [ %rd13 + 0 ], %rd14;
|
| 94 |
+
// end inline asm
|
| 95 |
+
.loc 1 27 18 // c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:27:18
|
| 96 |
+
mul.f32 %r11, %r10, 0f3F317218;
|
| 97 |
+
.loc 1 28 49 // c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:28:49
|
| 98 |
+
setp.lt.s64 %p9, %rd8, 2;
|
| 99 |
+
.loc 1 28 75 // c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:28:75
|
| 100 |
+
setp.gt.s64 %p10, %rd8, 1;
|
| 101 |
+
.loc 1 28 66 // c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:28:66
|
| 102 |
+
selp.b64 %rd22, %rd8, 0, %p10;
|
| 103 |
+
.loc 1 28 0 // c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:28
|
| 104 |
+
selp.b64 %rd23, 1, 0, %p9;
|
| 105 |
+
.loc 1 28 57 // c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:28:57
|
| 106 |
+
add.s64 %rd24, %rd22, %rd23;
|
| 107 |
+
.loc 1 28 34 // c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:28:34
|
| 108 |
+
mul.lo.s64 %rd25, %rd20, %rd24;
|
| 109 |
+
.loc 1 28 25 // c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:28:25
|
| 110 |
+
shl.b64 %rd26, %rd18, 2;
|
| 111 |
+
add.s64 %rd27, %rd7, %rd26;
|
| 112 |
+
shl.b64 %rd28, %rd25, 2;
|
| 113 |
+
add.s64 %rd15, %rd27, %rd28;
|
| 114 |
+
.loc 1 28 88 // c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:28:88
|
| 115 |
+
// begin inline asm
|
| 116 |
+
@%p2 st.global.b32 [ %rd15 + 0 ], { %r11 };
|
| 117 |
+
// end inline asm
|
| 118 |
+
.loc 1 28 4 // c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py:28:4
|
| 119 |
+
ret;
|
| 120 |
+
$L__tmp5:
|
| 121 |
+
$L__func_end0:
|
| 122 |
+
// -- End function
|
| 123 |
+
}
|
| 124 |
+
.file 1 "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py"
|
| 125 |
+
.file 2 "/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py"
|
| 126 |
+
.section .debug_abbrev
|
| 127 |
+
{
|
| 128 |
+
.b8 1 // Abbreviation Code
|
| 129 |
+
.b8 17 // DW_TAG_compile_unit
|
| 130 |
+
.b8 1 // DW_CHILDREN_yes
|
| 131 |
+
.b8 37 // DW_AT_producer
|
| 132 |
+
.b8 8 // DW_FORM_string
|
| 133 |
+
.b8 19 // DW_AT_language
|
| 134 |
+
.b8 5 // DW_FORM_data2
|
| 135 |
+
.b8 3 // DW_AT_name
|
| 136 |
+
.b8 8 // DW_FORM_string
|
| 137 |
+
.b8 16 // DW_AT_stmt_list
|
| 138 |
+
.b8 6 // DW_FORM_data4
|
| 139 |
+
.b8 27 // DW_AT_comp_dir
|
| 140 |
+
.b8 8 // DW_FORM_string
|
| 141 |
+
.b8 0 // EOM(1)
|
| 142 |
+
.b8 0 // EOM(2)
|
| 143 |
+
.b8 2 // Abbreviation Code
|
| 144 |
+
.b8 46 // DW_TAG_subprogram
|
| 145 |
+
.b8 0 // DW_CHILDREN_no
|
| 146 |
+
.b8 3 // DW_AT_name
|
| 147 |
+
.b8 8 // DW_FORM_string
|
| 148 |
+
.b8 32 // DW_AT_inline
|
| 149 |
+
.b8 11 // DW_FORM_data1
|
| 150 |
+
.b8 0 // EOM(1)
|
| 151 |
+
.b8 0 // EOM(2)
|
| 152 |
+
.b8 3 // Abbreviation Code
|
| 153 |
+
.b8 46 // DW_TAG_subprogram
|
| 154 |
+
.b8 1 // DW_CHILDREN_yes
|
| 155 |
+
.b8 17 // DW_AT_low_pc
|
| 156 |
+
.b8 1 // DW_FORM_addr
|
| 157 |
+
.b8 18 // DW_AT_high_pc
|
| 158 |
+
.b8 1 // DW_FORM_addr
|
| 159 |
+
.b8 49 // DW_AT_abstract_origin
|
| 160 |
+
.b8 19 // DW_FORM_ref4
|
| 161 |
+
.b8 0 // EOM(1)
|
| 162 |
+
.b8 0 // EOM(2)
|
| 163 |
+
.b8 4 // Abbreviation Code
|
| 164 |
+
.b8 29 // DW_TAG_inlined_subroutine
|
| 165 |
+
.b8 0 // DW_CHILDREN_no
|
| 166 |
+
.b8 49 // DW_AT_abstract_origin
|
| 167 |
+
.b8 19 // DW_FORM_ref4
|
| 168 |
+
.b8 17 // DW_AT_low_pc
|
| 169 |
+
.b8 1 // DW_FORM_addr
|
| 170 |
+
.b8 18 // DW_AT_high_pc
|
| 171 |
+
.b8 1 // DW_FORM_addr
|
| 172 |
+
.b8 88 // DW_AT_call_file
|
| 173 |
+
.b8 11 // DW_FORM_data1
|
| 174 |
+
.b8 89 // DW_AT_call_line
|
| 175 |
+
.b8 11 // DW_FORM_data1
|
| 176 |
+
.b8 87 // DW_AT_call_column
|
| 177 |
+
.b8 11 // DW_FORM_data1
|
| 178 |
+
.b8 0 // EOM(1)
|
| 179 |
+
.b8 0 // EOM(2)
|
| 180 |
+
.b8 0 // EOM(3)
|
| 181 |
+
}
|
| 182 |
+
.section .debug_info
|
| 183 |
+
{
|
| 184 |
+
.b32 211 // Length of Unit
|
| 185 |
+
.b8 2 // DWARF version number
|
| 186 |
+
.b8 0
|
| 187 |
+
.b32 .debug_abbrev // Offset Into Abbrev. Section
|
| 188 |
+
.b8 8 // Address Size (in bytes)
|
| 189 |
+
.b8 1 // Abbrev [1] 0xb:0xcc DW_TAG_compile_unit
|
| 190 |
+
.b8 116 // DW_AT_producer
|
| 191 |
+
.b8 114
|
| 192 |
+
.b8 105
|
| 193 |
+
.b8 116
|
| 194 |
+
.b8 111
|
| 195 |
+
.b8 110
|
| 196 |
+
.b8 0
|
| 197 |
+
.b8 2 // DW_AT_language
|
| 198 |
+
.b8 0
|
| 199 |
+
.b8 99 // DW_AT_name
|
| 200 |
+
.b8 50
|
| 201 |
+
.b8 106
|
| 202 |
+
.b8 51
|
| 203 |
+
.b8 109
|
| 204 |
+
.b8 116
|
| 205 |
+
.b8 107
|
| 206 |
+
.b8 51
|
| 207 |
+
.b8 116
|
| 208 |
+
.b8 104
|
| 209 |
+
.b8 105
|
| 210 |
+
.b8 54
|
| 211 |
+
.b8 115
|
| 212 |
+
.b8 110
|
| 213 |
+
.b8 50
|
| 214 |
+
.b8 104
|
| 215 |
+
.b8 120
|
| 216 |
+
.b8 105
|
| 217 |
+
.b8 117
|
| 218 |
+
.b8 104
|
| 219 |
+
.b8 117
|
| 220 |
+
.b8 105
|
| 221 |
+
.b8 103
|
| 222 |
+
.b8 106
|
| 223 |
+
.b8 119
|
| 224 |
+
.b8 52
|
| 225 |
+
.b8 51
|
| 226 |
+
.b8 115
|
| 227 |
+
.b8 112
|
| 228 |
+
.b8 105
|
| 229 |
+
.b8 117
|
| 230 |
+
.b8 55
|
| 231 |
+
.b8 52
|
| 232 |
+
.b8 109
|
| 233 |
+
.b8 120
|
| 234 |
+
.b8 100
|
| 235 |
+
.b8 101
|
| 236 |
+
.b8 114
|
| 237 |
+
.b8 118
|
| 238 |
+
.b8 112
|
| 239 |
+
.b8 103
|
| 240 |
+
.b8 112
|
| 241 |
+
.b8 102
|
| 242 |
+
.b8 114
|
| 243 |
+
.b8 116
|
| 244 |
+
.b8 111
|
| 245 |
+
.b8 115
|
| 246 |
+
.b8 55
|
| 247 |
+
.b8 117
|
| 248 |
+
.b8 50
|
| 249 |
+
.b8 113
|
| 250 |
+
.b8 104
|
| 251 |
+
.b8 46
|
| 252 |
+
.b8 112
|
| 253 |
+
.b8 121
|
| 254 |
+
.b8 0
|
| 255 |
+
.b32 .debug_line // DW_AT_stmt_list
|
| 256 |
+
.b8 47 // DW_AT_comp_dir
|
| 257 |
+
.b8 119
|
| 258 |
+
.b8 111
|
| 259 |
+
.b8 114
|
| 260 |
+
.b8 107
|
| 261 |
+
.b8 115
|
| 262 |
+
.b8 112
|
| 263 |
+
.b8 97
|
| 264 |
+
.b8 99
|
| 265 |
+
.b8 101
|
| 266 |
+
.b8 47
|
| 267 |
+
.b8 104
|
| 268 |
+
.b8 97
|
| 269 |
+
.b8 110
|
| 270 |
+
.b8 114
|
| 271 |
+
.b8 117
|
| 272 |
+
.b8 105
|
| 273 |
+
.b8 47
|
| 274 |
+
.b8 106
|
| 275 |
+
.b8 117
|
| 276 |
+
.b8 110
|
| 277 |
+
.b8 113
|
| 278 |
+
.b8 117
|
| 279 |
+
.b8 97
|
| 280 |
+
.b8 110
|
| 281 |
+
.b8 47
|
| 282 |
+
.b8 83
|
| 283 |
+
.b8 112
|
| 284 |
+
.b8 101
|
| 285 |
+
.b8 99
|
| 286 |
+
.b8 70
|
| 287 |
+
.b8 111
|
| 288 |
+
.b8 114
|
| 289 |
+
.b8 103
|
| 290 |
+
.b8 101
|
| 291 |
+
.b8 47
|
| 292 |
+
.b8 99
|
| 293 |
+
.b8 97
|
| 294 |
+
.b8 99
|
| 295 |
+
.b8 104
|
| 296 |
+
.b8 101
|
| 297 |
+
.b8 47
|
| 298 |
+
.b8 99
|
| 299 |
+
.b8 111
|
| 300 |
+
.b8 109
|
| 301 |
+
.b8 112
|
| 302 |
+
.b8 105
|
| 303 |
+
.b8 108
|
| 304 |
+
.b8 101
|
| 305 |
+
.b8 100
|
| 306 |
+
.b8 95
|
| 307 |
+
.b8 107
|
| 308 |
+
.b8 101
|
| 309 |
+
.b8 114
|
| 310 |
+
.b8 110
|
| 311 |
+
.b8 101
|
| 312 |
+
.b8 108
|
| 313 |
+
.b8 115
|
| 314 |
+
.b8 47
|
| 315 |
+
.b8 50
|
| 316 |
+
.b8 106
|
| 317 |
+
.b8 0
|
| 318 |
+
.b8 2 // Abbrev [2] 0x8f:0x19 DW_TAG_subprogram
|
| 319 |
+
.b8 116 // DW_AT_name
|
| 320 |
+
.b8 114
|
| 321 |
+
.b8 105
|
| 322 |
+
.b8 116
|
| 323 |
+
.b8 111
|
| 324 |
+
.b8 110
|
| 325 |
+
.b8 95
|
| 326 |
+
.b8 112
|
| 327 |
+
.b8 111
|
| 328 |
+
.b8 105
|
| 329 |
+
.b8 95
|
| 330 |
+
.b8 102
|
| 331 |
+
.b8 117
|
| 332 |
+
.b8 115
|
| 333 |
+
.b8 101
|
| 334 |
+
.b8 100
|
| 335 |
+
.b8 95
|
| 336 |
+
.b8 109
|
| 337 |
+
.b8 117
|
| 338 |
+
.b8 108
|
| 339 |
+
.b8 95
|
| 340 |
+
.b8 49
|
| 341 |
+
.b8 0
|
| 342 |
+
.b8 1 // DW_AT_inline
|
| 343 |
+
.b8 3 // Abbrev [3] 0xa8:0x2e DW_TAG_subprogram
|
| 344 |
+
.b64 $L__func_begin0 // DW_AT_low_pc
|
| 345 |
+
.b64 $L__func_end0 // DW_AT_high_pc
|
| 346 |
+
.b32 143 // DW_AT_abstract_origin
|
| 347 |
+
.b8 4 // Abbrev [4] 0xbd:0x18 DW_TAG_inlined_subroutine
|
| 348 |
+
.b32 143 // DW_AT_abstract_origin
|
| 349 |
+
.b64 $L__tmp1 // DW_AT_low_pc
|
| 350 |
+
.b64 $L__tmp4 // DW_AT_high_pc
|
| 351 |
+
.b8 1 // DW_AT_call_file
|
| 352 |
+
.b8 24 // DW_AT_call_line
|
| 353 |
+
.b8 51 // DW_AT_call_column
|
| 354 |
+
.b8 0 // End Of Children Mark
|
| 355 |
+
.b8 0 // End Of Children Mark
|
| 356 |
+
}
|
| 357 |
+
.section .debug_macinfo { }
|
progress/SpecForge/cache/compiled_kernels/triton/2/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.source
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":18:0)
|
| 2 |
+
#loc22 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":69:0)
|
| 3 |
+
#loc34 = loc("in_ptr0"(#loc))
|
| 4 |
+
#loc35 = loc("out_ptr0"(#loc))
|
| 5 |
+
#loc36 = loc("ks0"(#loc))
|
| 6 |
+
#loc37 = loc("xnumel"(#loc))
|
| 7 |
+
#loc49 = loc("a"(#loc22))
|
| 8 |
+
#loc50 = loc("b"(#loc22))
|
| 9 |
+
module {
|
| 10 |
+
tt.func public @triton_poi_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
|
| 11 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc38)
|
| 12 |
+
%xoffset_0 = arith.constant 128 : i32 loc(#loc39)
|
| 13 |
+
%xoffset_1 = arith.constant 128 : i32 loc(#loc39)
|
| 14 |
+
%xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc39)
|
| 15 |
+
%xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc40)
|
| 16 |
+
%xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<128xi32> loc(#loc41)
|
| 17 |
+
%xindex_4 = arith.addi %xindex_3, %xindex : tensor<128xi32> loc(#loc41)
|
| 18 |
+
%xmask = tt.splat %xnumel : i32 -> tensor<128xi32> loc(#loc42)
|
| 19 |
+
%xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<128xi32> loc(#loc42)
|
| 20 |
+
%x0 = arith.extsi %xindex_4 : tensor<128xi32> to tensor<128xi64> loc(#loc43)
|
| 21 |
+
%x0_6 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc43)
|
| 22 |
+
%x0_7 = arith.remsi %x0, %x0_6 : tensor<128xi64> loc(#loc43)
|
| 23 |
+
%x1 = tt.call @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S128S_i64__(%xindex_4, %ks0) : (tensor<128xi32>, i64) -> tensor<128xi64> loc(#loc44)
|
| 24 |
+
%tmp0 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>> loc(#loc45)
|
| 25 |
+
%tmp0_8 = tt.addptr %tmp0, %xindex_4 : tensor<128x!tt.ptr<f32>>, tensor<128xi32> loc(#loc45)
|
| 26 |
+
%tmp0_9 = tt.load %tmp0_8, %xmask_5 evictionPolicy = evict_last : tensor<128x!tt.ptr<f32>> loc(#loc46)
|
| 27 |
+
%tmp1 = arith.constant 0.693147182 : f32 loc(#loc47)
|
| 28 |
+
%tmp2 = arith.constant dense<0.693147182> : tensor<128xf32> loc(#loc48)
|
| 29 |
+
%tmp2_10 = arith.mulf %tmp0_9, %tmp2 : tensor<128xf32> loc(#loc48)
|
| 30 |
+
%c1_i32 = arith.constant 1 : i32 loc(#loc12)
|
| 31 |
+
%0 = arith.extsi %c1_i32 : i32 to i64 loc(#loc12)
|
| 32 |
+
%1 = arith.cmpi sge, %0, %ks0 : i64 loc(#loc12)
|
| 33 |
+
%c1_i32_11 = arith.constant 1 : i32 loc(#loc13)
|
| 34 |
+
%c1_i32_12 = arith.constant 1 : i32 loc(#loc13)
|
| 35 |
+
%2 = arith.extui %1 : i1 to i32 loc(#loc13)
|
| 36 |
+
%3 = arith.muli %c1_i32_12, %2 : i32 loc(#loc13)
|
| 37 |
+
%c1_i32_13 = arith.constant 1 : i32 loc(#loc14)
|
| 38 |
+
%4 = arith.extsi %c1_i32_13 : i32 to i64 loc(#loc14)
|
| 39 |
+
%5 = arith.cmpi sgt, %ks0, %4 : i64 loc(#loc14)
|
| 40 |
+
%6 = arith.extui %5 : i1 to i64 loc(#loc15)
|
| 41 |
+
%7 = arith.muli %ks0, %6 : i64 loc(#loc15)
|
| 42 |
+
%8 = arith.extsi %3 : i32 to i64 loc(#loc16)
|
| 43 |
+
%9 = arith.addi %8, %7 : i64 loc(#loc16)
|
| 44 |
+
%10 = tt.splat %9 : i64 -> tensor<128xi64> loc(#loc17)
|
| 45 |
+
%11 = arith.muli %x1, %10 : tensor<128xi64> loc(#loc17)
|
| 46 |
+
%12 = arith.addi %x0_7, %11 : tensor<128xi64> loc(#loc18)
|
| 47 |
+
%13 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>> loc(#loc19)
|
| 48 |
+
%14 = tt.addptr %13, %12 : tensor<128x!tt.ptr<f32>>, tensor<128xi64> loc(#loc19)
|
| 49 |
+
tt.store %14, %tmp2_10, %xmask_5 : tensor<128x!tt.ptr<f32>> loc(#loc20)
|
| 50 |
+
tt.return loc(#loc21)
|
| 51 |
+
} loc(#loc)
|
| 52 |
+
tt.func private @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S128S_i64__(%a: tensor<128xi32> loc("a"(#loc22)), %b: i64 loc("b"(#loc22))) -> tensor<128xi64> attributes {noinline = false} {
|
| 53 |
+
%quot = arith.extsi %a : tensor<128xi32> to tensor<128xi64> loc(#loc51)
|
| 54 |
+
%quot_0 = tt.splat %b : i64 -> tensor<128xi64> loc(#loc51)
|
| 55 |
+
%quot_1 = arith.divsi %quot, %quot_0 : tensor<128xi64> loc(#loc51)
|
| 56 |
+
%remainder = arith.extsi %a : tensor<128xi32> to tensor<128xi64> loc(#loc52)
|
| 57 |
+
%remainder_2 = tt.splat %b : i64 -> tensor<128xi64> loc(#loc52)
|
| 58 |
+
%remainder_3 = arith.remsi %remainder, %remainder_2 : tensor<128xi64> loc(#loc52)
|
| 59 |
+
%fixed = arith.constant 0 : i32 loc(#loc53)
|
| 60 |
+
%fixed_4 = arith.extsi %fixed : i32 to i64 loc(#loc53)
|
| 61 |
+
%fixed_5 = tt.splat %fixed_4 : i64 -> tensor<128xi64> loc(#loc53)
|
| 62 |
+
%fixed_6 = arith.cmpi ne, %remainder_3, %fixed_5 : tensor<128xi64> loc(#loc53)
|
| 63 |
+
%fixed_7 = arith.constant 1 : i32 loc(#loc54)
|
| 64 |
+
%fixed_8 = arith.constant 1 : i64 loc(#loc54)
|
| 65 |
+
%fixed_9 = arith.constant dense<1> : tensor<128xi64> loc(#loc54)
|
| 66 |
+
%fixed_10 = arith.subi %quot_1, %fixed_9 : tensor<128xi64> loc(#loc54)
|
| 67 |
+
%fixed_11 = arith.select %fixed_6, %fixed_10, %quot_1 : tensor<128xi1>, tensor<128xi64> loc(#loc55)
|
| 68 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc28)
|
| 69 |
+
%cst = arith.constant dense<0> : tensor<128xi32> loc(#loc28)
|
| 70 |
+
%0 = arith.cmpi slt, %a, %cst : tensor<128xi32> loc(#loc28)
|
| 71 |
+
%c0_i32_12 = arith.constant 0 : i32 loc(#loc29)
|
| 72 |
+
%1 = arith.extsi %c0_i32_12 : i32 to i64 loc(#loc29)
|
| 73 |
+
%2 = arith.cmpi slt, %b, %1 : i64 loc(#loc29)
|
| 74 |
+
%3 = tt.splat %2 : i1 -> tensor<128xi1> loc(#loc30)
|
| 75 |
+
%4 = arith.cmpi ne, %0, %3 : tensor<128xi1> loc(#loc30)
|
| 76 |
+
%5 = arith.select %4, %fixed_11, %quot_1 : tensor<128xi1>, tensor<128xi64> loc(#loc31)
|
| 77 |
+
tt.return %5 : tensor<128xi64> loc(#loc32)
|
| 78 |
+
^bb1: // no predecessors
|
| 79 |
+
%6 = ub.poison : tensor<128xi64> loc(#loc33)
|
| 80 |
+
tt.return %6 : tensor<128xi64> loc(#loc33)
|
| 81 |
+
} loc(#loc22)
|
| 82 |
+
} loc(#loc)
|
| 83 |
+
#loc1 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":19:28)
|
| 84 |
+
#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":19:33)
|
| 85 |
+
#loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":20:36)
|
| 86 |
+
#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":20:23)
|
| 87 |
+
#loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":21:21)
|
| 88 |
+
#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":23:19)
|
| 89 |
+
#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":24:51)
|
| 90 |
+
#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":25:30)
|
| 91 |
+
#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":25:35)
|
| 92 |
+
#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":26:11)
|
| 93 |
+
#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":27:18)
|
| 94 |
+
#loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:49)
|
| 95 |
+
#loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:41)
|
| 96 |
+
#loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:75)
|
| 97 |
+
#loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:66)
|
| 98 |
+
#loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:57)
|
| 99 |
+
#loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:34)
|
| 100 |
+
#loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:30)
|
| 101 |
+
#loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:25)
|
| 102 |
+
#loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:88)
|
| 103 |
+
#loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:4)
|
| 104 |
+
#loc23 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
|
| 105 |
+
#loc24 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20)
|
| 106 |
+
#loc25 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
|
| 107 |
+
#loc26 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
|
| 108 |
+
#loc27 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
|
| 109 |
+
#loc28 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
|
| 110 |
+
#loc29 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36)
|
| 111 |
+
#loc30 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
|
| 112 |
+
#loc31 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
|
| 113 |
+
#loc32 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:11)
|
| 114 |
+
#loc33 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:4)
|
| 115 |
+
#loc38 = loc("xoffset"(#loc1))
|
| 116 |
+
#loc39 = loc("xoffset"(#loc2))
|
| 117 |
+
#loc40 = loc("xindex"(#loc3))
|
| 118 |
+
#loc41 = loc("xindex"(#loc4))
|
| 119 |
+
#loc42 = loc("xmask"(#loc5))
|
| 120 |
+
#loc43 = loc("x0"(#loc6))
|
| 121 |
+
#loc44 = loc("x1"(#loc7))
|
| 122 |
+
#loc45 = loc("tmp0"(#loc8))
|
| 123 |
+
#loc46 = loc("tmp0"(#loc9))
|
| 124 |
+
#loc47 = loc("tmp1"(#loc10))
|
| 125 |
+
#loc48 = loc("tmp2"(#loc11))
|
| 126 |
+
#loc51 = loc("quot"(#loc23))
|
| 127 |
+
#loc52 = loc("remainder"(#loc24))
|
| 128 |
+
#loc53 = loc("fixed"(#loc25))
|
| 129 |
+
#loc54 = loc("fixed"(#loc26))
|
| 130 |
+
#loc55 = loc("fixed"(#loc27))
|
progress/SpecForge/cache/compiled_kernels/triton/2/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ttgir
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
|
| 2 |
+
#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":18:0)
|
| 3 |
+
#loc30 = loc("in_ptr0"(#loc))
|
| 4 |
+
#loc31 = loc("out_ptr0"(#loc))
|
| 5 |
+
#loc32 = loc("ks0"(#loc))
|
| 6 |
+
#loc33 = loc("xnumel"(#loc))
|
| 7 |
+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
|
| 8 |
+
tt.func public @triton_poi_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
|
| 9 |
+
%c128_i32 = arith.constant 128 : i32 loc(#loc1)
|
| 10 |
+
%cst = arith.constant dense<0.693147182> : tensor<128xf32, #blocked> loc(#loc1)
|
| 11 |
+
%c1_i64 = arith.constant 1 : i64 loc(#loc1)
|
| 12 |
+
%c0_i64 = arith.constant 0 : i64 loc(#loc1)
|
| 13 |
+
%cst_0 = arith.constant dense<0> : tensor<128xi64, #blocked> loc(#loc1)
|
| 14 |
+
%cst_1 = arith.constant dense<0> : tensor<128xi32, #blocked> loc(#loc1)
|
| 15 |
+
%cst_2 = arith.constant dense<1> : tensor<128xi64, #blocked> loc(#loc1)
|
| 16 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc34)
|
| 17 |
+
%xoffset_3 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc35)
|
| 18 |
+
%xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked> loc(#loc36)
|
| 19 |
+
%xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<128xi32, #blocked> loc(#loc37)
|
| 20 |
+
%xindex_5 = arith.addi %xindex_4, %xindex : tensor<128xi32, #blocked> loc(#loc37)
|
| 21 |
+
%xmask = tt.splat %xnumel : i32 -> tensor<128xi32, #blocked> loc(#loc38)
|
| 22 |
+
%xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<128xi32, #blocked> loc(#loc38)
|
| 23 |
+
%x0 = arith.extsi %xindex_5 : tensor<128xi32, #blocked> to tensor<128xi64, #blocked> loc(#loc39)
|
| 24 |
+
%x0_7 = tt.splat %ks0 : i64 -> tensor<128xi64, #blocked> loc(#loc39)
|
| 25 |
+
%x0_8 = arith.remsi %x0, %x0_7 : tensor<128xi64, #blocked> loc(#loc39)
|
| 26 |
+
%quot = arith.divsi %x0, %x0_7 : tensor<128xi64, #blocked> loc(#loc49)
|
| 27 |
+
%fixed = arith.cmpi ne, %x0_8, %cst_0 : tensor<128xi64, #blocked> loc(#loc50)
|
| 28 |
+
%fixed_9 = arith.subi %quot, %cst_2 : tensor<128xi64, #blocked> loc(#loc51)
|
| 29 |
+
%fixed_10 = arith.select %fixed, %fixed_9, %quot : tensor<128xi1, #blocked>, tensor<128xi64, #blocked> loc(#loc52)
|
| 30 |
+
%x1 = arith.cmpi slt, %xindex_5, %cst_1 : tensor<128xi32, #blocked> loc(#loc53)
|
| 31 |
+
%x1_11 = arith.cmpi slt, %ks0, %c0_i64 : i64 loc(#loc54)
|
| 32 |
+
%x1_12 = tt.splat %x1_11 : i1 -> tensor<128xi1, #blocked> loc(#loc55)
|
| 33 |
+
%x1_13 = arith.cmpi ne, %x1, %x1_12 : tensor<128xi1, #blocked> loc(#loc55)
|
| 34 |
+
%x1_14 = arith.select %x1_13, %fixed_10, %quot : tensor<128xi1, #blocked>, tensor<128xi64, #blocked> loc(#loc56)
|
| 35 |
+
%tmp0 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>, #blocked> loc(#loc45)
|
| 36 |
+
%tmp0_15 = tt.addptr %tmp0, %xindex_5 : tensor<128x!tt.ptr<f32>, #blocked>, tensor<128xi32, #blocked> loc(#loc45)
|
| 37 |
+
%tmp0_16 = tt.load %tmp0_15, %xmask_6 evictionPolicy = evict_last : tensor<128x!tt.ptr<f32>, #blocked> loc(#loc46)
|
| 38 |
+
%tmp2 = arith.mulf %tmp0_16, %cst : tensor<128xf32, #blocked> loc(#loc47)
|
| 39 |
+
%0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc20)
|
| 40 |
+
%1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc21)
|
| 41 |
+
%2 = arith.extui %1 : i1 to i64 loc(#loc22)
|
| 42 |
+
%3 = arith.muli %ks0, %2 : i64 loc(#loc22)
|
| 43 |
+
%4 = arith.extui %0 : i1 to i64 loc(#loc48)
|
| 44 |
+
%5 = arith.addi %4, %3 : i64 loc(#loc23)
|
| 45 |
+
%6 = tt.splat %5 : i64 -> tensor<128xi64, #blocked> loc(#loc25)
|
| 46 |
+
%7 = arith.muli %x1_14, %6 : tensor<128xi64, #blocked> loc(#loc25)
|
| 47 |
+
%8 = arith.addi %x0_8, %7 : tensor<128xi64, #blocked> loc(#loc26)
|
| 48 |
+
%9 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>, #blocked> loc(#loc27)
|
| 49 |
+
%10 = tt.addptr %9, %8 : tensor<128x!tt.ptr<f32>, #blocked>, tensor<128xi64, #blocked> loc(#loc27)
|
| 50 |
+
tt.store %10, %tmp2, %xmask_6 : tensor<128x!tt.ptr<f32>, #blocked> loc(#loc28)
|
| 51 |
+
tt.return loc(#loc29)
|
| 52 |
+
} loc(#loc)
|
| 53 |
+
} loc(#loc)
|
| 54 |
+
#loc1 = loc(unknown)
|
| 55 |
+
#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":19:28)
|
| 56 |
+
#loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":19:33)
|
| 57 |
+
#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":20:36)
|
| 58 |
+
#loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":20:23)
|
| 59 |
+
#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":21:21)
|
| 60 |
+
#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":23:19)
|
| 61 |
+
#loc8 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
|
| 62 |
+
#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":24:51)
|
| 63 |
+
#loc10 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
|
| 64 |
+
#loc11 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
|
| 65 |
+
#loc12 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
|
| 66 |
+
#loc13 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
|
| 67 |
+
#loc14 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36)
|
| 68 |
+
#loc15 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
|
| 69 |
+
#loc16 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
|
| 70 |
+
#loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":25:30)
|
| 71 |
+
#loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":25:35)
|
| 72 |
+
#loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":27:18)
|
| 73 |
+
#loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:49)
|
| 74 |
+
#loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:75)
|
| 75 |
+
#loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:66)
|
| 76 |
+
#loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:57)
|
| 77 |
+
#loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:41)
|
| 78 |
+
#loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:34)
|
| 79 |
+
#loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:30)
|
| 80 |
+
#loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:25)
|
| 81 |
+
#loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:88)
|
| 82 |
+
#loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:4)
|
| 83 |
+
#loc34 = loc("xoffset"(#loc2))
|
| 84 |
+
#loc35 = loc("xoffset"(#loc3))
|
| 85 |
+
#loc36 = loc("xindex"(#loc4))
|
| 86 |
+
#loc37 = loc("xindex"(#loc5))
|
| 87 |
+
#loc38 = loc("xmask"(#loc6))
|
| 88 |
+
#loc39 = loc("x0"(#loc7))
|
| 89 |
+
#loc40 = loc("quot"(#loc8))
|
| 90 |
+
#loc41 = loc("x1"(#loc9))
|
| 91 |
+
#loc42 = loc("fixed"(#loc10))
|
| 92 |
+
#loc43 = loc("fixed"(#loc11))
|
| 93 |
+
#loc44 = loc("fixed"(#loc12))
|
| 94 |
+
#loc45 = loc("tmp0"(#loc17))
|
| 95 |
+
#loc46 = loc("tmp0"(#loc18))
|
| 96 |
+
#loc47 = loc("tmp2"(#loc19))
|
| 97 |
+
#loc48 = loc(fused[#loc23, #loc24])
|
| 98 |
+
#loc49 = loc(callsite(#loc40 at #loc41))
|
| 99 |
+
#loc50 = loc(callsite(#loc42 at #loc41))
|
| 100 |
+
#loc51 = loc(callsite(#loc43 at #loc41))
|
| 101 |
+
#loc52 = loc(callsite(#loc44 at #loc41))
|
| 102 |
+
#loc53 = loc(callsite(#loc13 at #loc41))
|
| 103 |
+
#loc54 = loc(callsite(#loc14 at #loc41))
|
| 104 |
+
#loc55 = loc(callsite(#loc15 at #loc41))
|
| 105 |
+
#loc56 = loc(callsite(#loc16 at #loc41))
|
progress/SpecForge/cache/compiled_kernels/triton/2/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ttir
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":18:0)
|
| 2 |
+
#loc30 = loc("in_ptr0"(#loc))
|
| 3 |
+
#loc31 = loc("out_ptr0"(#loc))
|
| 4 |
+
#loc32 = loc("ks0"(#loc))
|
| 5 |
+
#loc33 = loc("xnumel"(#loc))
|
| 6 |
+
module {
|
| 7 |
+
tt.func public @triton_poi_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
|
| 8 |
+
%fixed = arith.constant dense<1> : tensor<128xi64> loc(#loc49)
|
| 9 |
+
%x1 = arith.constant dense<0> : tensor<128xi32> loc(#loc50)
|
| 10 |
+
%fixed_0 = arith.constant dense<0> : tensor<128xi64> loc(#loc51)
|
| 11 |
+
%x1_1 = arith.constant 0 : i64 loc(#loc52)
|
| 12 |
+
%c1_i64 = arith.constant 1 : i64 loc(#loc6)
|
| 13 |
+
%tmp2 = arith.constant dense<0.693147182> : tensor<128xf32> loc(#loc37)
|
| 14 |
+
%c128_i32 = arith.constant 128 : i32 loc(#loc6)
|
| 15 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc38)
|
| 16 |
+
%xoffset_2 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc39)
|
| 17 |
+
%xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc40)
|
| 18 |
+
%xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<128xi32> loc(#loc41)
|
| 19 |
+
%xindex_4 = arith.addi %xindex_3, %xindex : tensor<128xi32> loc(#loc41)
|
| 20 |
+
%xmask = tt.splat %xnumel : i32 -> tensor<128xi32> loc(#loc42)
|
| 21 |
+
%xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<128xi32> loc(#loc42)
|
| 22 |
+
%x0 = arith.extsi %xindex_4 : tensor<128xi32> to tensor<128xi64> loc(#loc43)
|
| 23 |
+
%x0_6 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc43)
|
| 24 |
+
%x0_7 = arith.remsi %x0, %x0_6 : tensor<128xi64> loc(#loc43)
|
| 25 |
+
%quot = arith.divsi %x0, %x0_6 : tensor<128xi64> loc(#loc53)
|
| 26 |
+
%fixed_8 = arith.cmpi ne, %x0_7, %fixed_0 : tensor<128xi64> loc(#loc51)
|
| 27 |
+
%fixed_9 = arith.subi %quot, %fixed : tensor<128xi64> loc(#loc49)
|
| 28 |
+
%fixed_10 = arith.select %fixed_8, %fixed_9, %quot : tensor<128xi1>, tensor<128xi64> loc(#loc54)
|
| 29 |
+
%x1_11 = arith.cmpi slt, %xindex_4, %x1 : tensor<128xi32> loc(#loc50)
|
| 30 |
+
%x1_12 = arith.cmpi slt, %ks0, %x1_1 : i64 loc(#loc52)
|
| 31 |
+
%x1_13 = tt.splat %x1_12 : i1 -> tensor<128xi1> loc(#loc55)
|
| 32 |
+
%x1_14 = arith.cmpi ne, %x1_11, %x1_13 : tensor<128xi1> loc(#loc55)
|
| 33 |
+
%x1_15 = arith.select %x1_14, %fixed_10, %quot : tensor<128xi1>, tensor<128xi64> loc(#loc56)
|
| 34 |
+
%tmp0 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>> loc(#loc46)
|
| 35 |
+
%tmp0_16 = tt.addptr %tmp0, %xindex_4 : tensor<128x!tt.ptr<f32>>, tensor<128xi32> loc(#loc46)
|
| 36 |
+
%tmp0_17 = tt.load %tmp0_16, %xmask_5 evictionPolicy = evict_last : tensor<128x!tt.ptr<f32>> loc(#loc47)
|
| 37 |
+
%tmp2_18 = arith.mulf %tmp0_17, %tmp2 : tensor<128xf32> loc(#loc37)
|
| 38 |
+
%0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc20)
|
| 39 |
+
%1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc21)
|
| 40 |
+
%2 = arith.extui %1 : i1 to i64 loc(#loc22)
|
| 41 |
+
%3 = arith.muli %ks0, %2 : i64 loc(#loc22)
|
| 42 |
+
%4 = arith.extui %0 : i1 to i64 loc(#loc48)
|
| 43 |
+
%5 = arith.addi %4, %3 : i64 loc(#loc23)
|
| 44 |
+
%6 = tt.splat %5 : i64 -> tensor<128xi64> loc(#loc25)
|
| 45 |
+
%7 = arith.muli %x1_15, %6 : tensor<128xi64> loc(#loc25)
|
| 46 |
+
%8 = arith.addi %x0_7, %7 : tensor<128xi64> loc(#loc26)
|
| 47 |
+
%9 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>> loc(#loc27)
|
| 48 |
+
%10 = tt.addptr %9, %8 : tensor<128x!tt.ptr<f32>>, tensor<128xi64> loc(#loc27)
|
| 49 |
+
tt.store %10, %tmp2_18, %xmask_5 : tensor<128x!tt.ptr<f32>> loc(#loc28)
|
| 50 |
+
tt.return loc(#loc29)
|
| 51 |
+
} loc(#loc)
|
| 52 |
+
} loc(#loc)
|
| 53 |
+
#loc1 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
|
| 54 |
+
#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":24:51)
|
| 55 |
+
#loc3 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
|
| 56 |
+
#loc4 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
|
| 57 |
+
#loc5 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36)
|
| 58 |
+
#loc6 = loc(unknown)
|
| 59 |
+
#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":27:18)
|
| 60 |
+
#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":19:28)
|
| 61 |
+
#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":19:33)
|
| 62 |
+
#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":20:36)
|
| 63 |
+
#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":20:23)
|
| 64 |
+
#loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":21:21)
|
| 65 |
+
#loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":23:19)
|
| 66 |
+
#loc14 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
|
| 67 |
+
#loc15 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
|
| 68 |
+
#loc16 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
|
| 69 |
+
#loc17 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
|
| 70 |
+
#loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":25:30)
|
| 71 |
+
#loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":25:35)
|
| 72 |
+
#loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:49)
|
| 73 |
+
#loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:75)
|
| 74 |
+
#loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:66)
|
| 75 |
+
#loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:57)
|
| 76 |
+
#loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:41)
|
| 77 |
+
#loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:34)
|
| 78 |
+
#loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:30)
|
| 79 |
+
#loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:25)
|
| 80 |
+
#loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:88)
|
| 81 |
+
#loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/2j/c2j3mtk3thi6sn2hxiuhuigjw43spiu74mxdervpgpfrtos7u2qh.py":28:4)
|
| 82 |
+
#loc34 = loc("fixed"(#loc1))
|
| 83 |
+
#loc35 = loc("x1"(#loc2))
|
| 84 |
+
#loc36 = loc("fixed"(#loc4))
|
| 85 |
+
#loc37 = loc("tmp2"(#loc7))
|
| 86 |
+
#loc38 = loc("xoffset"(#loc8))
|
| 87 |
+
#loc39 = loc("xoffset"(#loc9))
|
| 88 |
+
#loc40 = loc("xindex"(#loc10))
|
| 89 |
+
#loc41 = loc("xindex"(#loc11))
|
| 90 |
+
#loc42 = loc("xmask"(#loc12))
|
| 91 |
+
#loc43 = loc("x0"(#loc13))
|
| 92 |
+
#loc44 = loc("quot"(#loc14))
|
| 93 |
+
#loc45 = loc("fixed"(#loc15))
|
| 94 |
+
#loc46 = loc("tmp0"(#loc18))
|
| 95 |
+
#loc47 = loc("tmp0"(#loc19))
|
| 96 |
+
#loc48 = loc(fused[#loc23, #loc24])
|
| 97 |
+
#loc49 = loc(callsite(#loc34 at #loc35))
|
| 98 |
+
#loc50 = loc(callsite(#loc3 at #loc35))
|
| 99 |
+
#loc51 = loc(callsite(#loc36 at #loc35))
|
| 100 |
+
#loc52 = loc(callsite(#loc5 at #loc35))
|
| 101 |
+
#loc53 = loc(callsite(#loc44 at #loc35))
|
| 102 |
+
#loc54 = loc(callsite(#loc45 at #loc35))
|
| 103 |
+
#loc55 = loc(callsite(#loc16 at #loc35))
|
| 104 |
+
#loc56 = loc(callsite(#loc17 at #loc35))
|
progress/SpecForge/cache/compiled_kernels/triton/2/4WLUGGKP4KNR26REGICEZ2CG72LG46AK5KFUG6JJKIDMZRATGGHQ/__grp__triton_poi_fused_mul_1.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"child_paths": {"triton_poi_fused_mul_1.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/4WLUGGKP4KNR26REGICEZ2CG72LG46AK5KFUG6JJKIDMZRATGGHQ/triton_poi_fused_mul_1.source", "triton_poi_fused_mul_1.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/4WLUGGKP4KNR26REGICEZ2CG72LG46AK5KFUG6JJKIDMZRATGGHQ/triton_poi_fused_mul_1.ttir", "triton_poi_fused_mul_1.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/4WLUGGKP4KNR26REGICEZ2CG72LG46AK5KFUG6JJKIDMZRATGGHQ/triton_poi_fused_mul_1.ttgir", "triton_poi_fused_mul_1.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/4WLUGGKP4KNR26REGICEZ2CG72LG46AK5KFUG6JJKIDMZRATGGHQ/triton_poi_fused_mul_1.llir", "triton_poi_fused_mul_1.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/4WLUGGKP4KNR26REGICEZ2CG72LG46AK5KFUG6JJKIDMZRATGGHQ/triton_poi_fused_mul_1.ptx", "triton_poi_fused_mul_1.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/4WLUGGKP4KNR26REGICEZ2CG72LG46AK5KFUG6JJKIDMZRATGGHQ/triton_poi_fused_mul_1.cubin", "triton_poi_fused_mul_1.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/4WLUGGKP4KNR26REGICEZ2CG72LG46AK5KFUG6JJKIDMZRATGGHQ/triton_poi_fused_mul_1.json"}}
|
progress/SpecForge/cache/compiled_kernels/triton/2/4WLUGGKP4KNR26REGICEZ2CG72LG46AK5KFUG6JJKIDMZRATGGHQ/triton_poi_fused_mul_1.cubin
ADDED
|
Binary file (5.9 kB). View file
|
|
|
progress/SpecForge/cache/compiled_kernels/triton/2/4WLUGGKP4KNR26REGICEZ2CG72LG46AK5KFUG6JJKIDMZRATGGHQ/triton_poi_fused_mul_1.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"hash": "e59743194fe29b1d7a2432044ce846fe966e780aea8b4379295206ccc413318f", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_mul_1"}
|
progress/SpecForge/cache/compiled_kernels/triton/2/4WLUGGKP4KNR26REGICEZ2CG72LG46AK5KFUG6JJKIDMZRATGGHQ/triton_poi_fused_mul_1.llir
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
; ModuleID = 'LLVMDialectModule'
|
| 2 |
+
source_filename = "LLVMDialectModule"
|
| 3 |
+
target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
| 4 |
+
|
| 5 |
+
; Function Attrs: nounwind
|
| 6 |
+
define ptx_kernel void @triton_poi_fused_mul_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 {
|
| 7 |
+
%6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
|
| 8 |
+
%7 = shl i32 %6, 8, !dbg !8
|
| 9 |
+
%8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
|
| 10 |
+
%9 = shl nuw nsw i32 %8, 1, !dbg !9
|
| 11 |
+
%10 = and i32 %9, 254, !dbg !9
|
| 12 |
+
%11 = or disjoint i32 %10, %7, !dbg !10
|
| 13 |
+
%12 = icmp slt i32 %11, 29184, !dbg !11
|
| 14 |
+
%13 = sext i32 %11 to i64, !dbg !12
|
| 15 |
+
%14 = getelementptr float, ptr addrspace(1) %0, i64 %13, !dbg !12
|
| 16 |
+
%15 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %14, i1 %12) #2, !dbg !13
|
| 17 |
+
%16 = extractvalue { i32, i32 } %15, 0, !dbg !13
|
| 18 |
+
%17 = extractvalue { i32, i32 } %15, 1, !dbg !13
|
| 19 |
+
%18 = bitcast i32 %16 to float, !dbg !13
|
| 20 |
+
%19 = bitcast i32 %17 to float, !dbg !13
|
| 21 |
+
%20 = fmul float %18, 0x3FE62E4300000000, !dbg !14
|
| 22 |
+
%21 = fmul float %19, 0x3FE62E4300000000, !dbg !14
|
| 23 |
+
%22 = getelementptr float, ptr addrspace(1) %1, i64 %13, !dbg !15
|
| 24 |
+
%23 = bitcast float %20 to i32, !dbg !16
|
| 25 |
+
%24 = bitcast float %21 to i32, !dbg !16
|
| 26 |
+
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %23, i32 %24, ptr addrspace(1) %22, i1 %12) #2, !dbg !16
|
| 27 |
+
ret void, !dbg !17
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 31 |
+
declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
|
| 32 |
+
|
| 33 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 34 |
+
declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
|
| 35 |
+
|
| 36 |
+
attributes #0 = { nounwind "nvvm.reqntid"="128" }
|
| 37 |
+
attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
| 38 |
+
attributes #2 = { nounwind }
|
| 39 |
+
|
| 40 |
+
!llvm.dbg.cu = !{!0}
|
| 41 |
+
!llvm.module.flags = !{!2, !3}
|
| 42 |
+
|
| 43 |
+
!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
|
| 44 |
+
!1 = !DIFile(filename: "cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py", directory: "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6")
|
| 45 |
+
!2 = !{i32 2, !"Debug Info Version", i32 3}
|
| 46 |
+
!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
| 47 |
+
!4 = distinct !DISubprogram(name: "triton_poi_fused_mul_1", linkageName: "triton_poi_fused_mul_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
|
| 48 |
+
!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
|
| 49 |
+
!6 = !{}
|
| 50 |
+
!7 = !DILocation(line: 20, column: 28, scope: !4)
|
| 51 |
+
!8 = !DILocation(line: 20, column: 33, scope: !4)
|
| 52 |
+
!9 = !DILocation(line: 21, column: 36, scope: !4)
|
| 53 |
+
!10 = !DILocation(line: 21, column: 23, scope: !4)
|
| 54 |
+
!11 = !DILocation(line: 22, column: 21, scope: !4)
|
| 55 |
+
!12 = !DILocation(line: 24, column: 30, scope: !4)
|
| 56 |
+
!13 = !DILocation(line: 24, column: 35, scope: !4)
|
| 57 |
+
!14 = !DILocation(line: 26, column: 18, scope: !4)
|
| 58 |
+
!15 = !DILocation(line: 27, column: 25, scope: !4)
|
| 59 |
+
!16 = !DILocation(line: 27, column: 36, scope: !4)
|
| 60 |
+
!17 = !DILocation(line: 27, column: 4, scope: !4)
|
progress/SpecForge/cache/compiled_kernels/triton/2/4WLUGGKP4KNR26REGICEZ2CG72LG46AK5KFUG6JJKIDMZRATGGHQ/triton_poi_fused_mul_1.ptx
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// Generated by LLVM NVPTX Back-End
|
| 3 |
+
//
|
| 4 |
+
|
| 5 |
+
.version 8.7
|
| 6 |
+
.target sm_90a
|
| 7 |
+
.address_size 64
|
| 8 |
+
|
| 9 |
+
// .globl triton_poi_fused_mul_1 // -- Begin function triton_poi_fused_mul_1
|
| 10 |
+
// @triton_poi_fused_mul_1
|
| 11 |
+
.visible .entry triton_poi_fused_mul_1(
|
| 12 |
+
.param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_0,
|
| 13 |
+
.param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_1,
|
| 14 |
+
.param .u32 triton_poi_fused_mul_1_param_2,
|
| 15 |
+
.param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_3,
|
| 16 |
+
.param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_4
|
| 17 |
+
)
|
| 18 |
+
.reqntid 128
|
| 19 |
+
{
|
| 20 |
+
.reg .pred %p<3>;
|
| 21 |
+
.reg .b32 %r<11>;
|
| 22 |
+
.reg .b64 %rd<6>;
|
| 23 |
+
.loc 1 18 0 // cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py:18:0
|
| 24 |
+
$L__func_begin0:
|
| 25 |
+
.loc 1 18 0 // cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py:18:0
|
| 26 |
+
|
| 27 |
+
// %bb.0:
|
| 28 |
+
ld.param.b64 %rd3, [triton_poi_fused_mul_1_param_0];
|
| 29 |
+
ld.param.b64 %rd4, [triton_poi_fused_mul_1_param_1];
|
| 30 |
+
$L__tmp0:
|
| 31 |
+
.loc 1 20 28 // cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py:20:28
|
| 32 |
+
mov.u32 %r5, %ctaid.x;
|
| 33 |
+
.loc 1 20 33 // cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py:20:33
|
| 34 |
+
shl.b32 %r6, %r5, 8;
|
| 35 |
+
.loc 1 21 36 // cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py:21:36
|
| 36 |
+
mov.u32 %r7, %tid.x;
|
| 37 |
+
shl.b32 %r8, %r7, 1;
|
| 38 |
+
and.b32 %r9, %r8, 254;
|
| 39 |
+
.loc 1 21 23 // cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py:21:23
|
| 40 |
+
or.b32 %r10, %r9, %r6;
|
| 41 |
+
.loc 1 22 21 // cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py:22:21
|
| 42 |
+
setp.lt.s32 %p1, %r10, 29184;
|
| 43 |
+
.loc 1 24 30 // cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py:24:30
|
| 44 |
+
mul.wide.s32 %rd5, %r10, 4;
|
| 45 |
+
add.s64 %rd1, %rd3, %rd5;
|
| 46 |
+
.loc 1 24 35 // cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py:24:35
|
| 47 |
+
// begin inline asm
|
| 48 |
+
mov.u32 %r1, 0x0;
|
| 49 |
+
mov.u32 %r2, 0x0;
|
| 50 |
+
@%p1 ld.global.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ];
|
| 51 |
+
// end inline asm
|
| 52 |
+
.loc 1 26 18 // cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py:26:18
|
| 53 |
+
mul.f32 %r3, %r1, 0f3F317218;
|
| 54 |
+
mul.f32 %r4, %r2, 0f3F317218;
|
| 55 |
+
.loc 1 27 25 // cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py:27:25
|
| 56 |
+
add.s64 %rd2, %rd4, %rd5;
|
| 57 |
+
.loc 1 27 36 // cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py:27:36
|
| 58 |
+
// begin inline asm
|
| 59 |
+
@%p1 st.global.v2.b32 [ %rd2 + 0 ], { %r3, %r4 };
|
| 60 |
+
// end inline asm
|
| 61 |
+
.loc 1 27 4 // cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py:27:4
|
| 62 |
+
ret;
|
| 63 |
+
$L__tmp1:
|
| 64 |
+
$L__func_end0:
|
| 65 |
+
// -- End function
|
| 66 |
+
}
|
| 67 |
+
.file 1 "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py"
|
| 68 |
+
.section .debug_abbrev
|
| 69 |
+
{
|
| 70 |
+
.b8 1 // Abbreviation Code
|
| 71 |
+
.b8 17 // DW_TAG_compile_unit
|
| 72 |
+
.b8 0 // DW_CHILDREN_no
|
| 73 |
+
.b8 37 // DW_AT_producer
|
| 74 |
+
.b8 8 // DW_FORM_string
|
| 75 |
+
.b8 19 // DW_AT_language
|
| 76 |
+
.b8 5 // DW_FORM_data2
|
| 77 |
+
.b8 3 // DW_AT_name
|
| 78 |
+
.b8 8 // DW_FORM_string
|
| 79 |
+
.b8 16 // DW_AT_stmt_list
|
| 80 |
+
.b8 6 // DW_FORM_data4
|
| 81 |
+
.b8 27 // DW_AT_comp_dir
|
| 82 |
+
.b8 8 // DW_FORM_string
|
| 83 |
+
.b8 0 // EOM(1)
|
| 84 |
+
.b8 0 // EOM(2)
|
| 85 |
+
.b8 0 // EOM(3)
|
| 86 |
+
}
|
| 87 |
+
.section .debug_info
|
| 88 |
+
{
|
| 89 |
+
.b32 139 // Length of Unit
|
| 90 |
+
.b8 2 // DWARF version number
|
| 91 |
+
.b8 0
|
| 92 |
+
.b32 .debug_abbrev // Offset Into Abbrev. Section
|
| 93 |
+
.b8 8 // Address Size (in bytes)
|
| 94 |
+
.b8 1 // Abbrev [1] 0xb:0x84 DW_TAG_compile_unit
|
| 95 |
+
.b8 116 // DW_AT_producer
|
| 96 |
+
.b8 114
|
| 97 |
+
.b8 105
|
| 98 |
+
.b8 116
|
| 99 |
+
.b8 111
|
| 100 |
+
.b8 110
|
| 101 |
+
.b8 0
|
| 102 |
+
.b8 2 // DW_AT_language
|
| 103 |
+
.b8 0
|
| 104 |
+
.b8 99 // DW_AT_name
|
| 105 |
+
.b8 108
|
| 106 |
+
.b8 54
|
| 107 |
+
.b8 120
|
| 108 |
+
.b8 104
|
| 109 |
+
.b8 121
|
| 110 |
+
.b8 120
|
| 111 |
+
.b8 116
|
| 112 |
+
.b8 113
|
| 113 |
+
.b8 116
|
| 114 |
+
.b8 110
|
| 115 |
+
.b8 99
|
| 116 |
+
.b8 122
|
| 117 |
+
.b8 112
|
| 118 |
+
.b8 100
|
| 119 |
+
.b8 108
|
| 120 |
+
.b8 109
|
| 121 |
+
.b8 98
|
| 122 |
+
.b8 51
|
| 123 |
+
.b8 119
|
| 124 |
+
.b8 53
|
| 125 |
+
.b8 111
|
| 126 |
+
.b8 105
|
| 127 |
+
.b8 109
|
| 128 |
+
.b8 105
|
| 129 |
+
.b8 113
|
| 130 |
+
.b8 52
|
| 131 |
+
.b8 103
|
| 132 |
+
.b8 118
|
| 133 |
+
.b8 116
|
| 134 |
+
.b8 97
|
| 135 |
+
.b8 115
|
| 136 |
+
.b8 113
|
| 137 |
+
.b8 106
|
| 138 |
+
.b8 115
|
| 139 |
+
.b8 119
|
| 140 |
+
.b8 98
|
| 141 |
+
.b8 103
|
| 142 |
+
.b8 122
|
| 143 |
+
.b8 52
|
| 144 |
+
.b8 121
|
| 145 |
+
.b8 97
|
| 146 |
+
.b8 107
|
| 147 |
+
.b8 104
|
| 148 |
+
.b8 110
|
| 149 |
+
.b8 104
|
| 150 |
+
.b8 52
|
| 151 |
+
.b8 113
|
| 152 |
+
.b8 107
|
| 153 |
+
.b8 112
|
| 154 |
+
.b8 115
|
| 155 |
+
.b8 55
|
| 156 |
+
.b8 46
|
| 157 |
+
.b8 112
|
| 158 |
+
.b8 121
|
| 159 |
+
.b8 0
|
| 160 |
+
.b32 .debug_line // DW_AT_stmt_list
|
| 161 |
+
.b8 47 // DW_AT_comp_dir
|
| 162 |
+
.b8 119
|
| 163 |
+
.b8 111
|
| 164 |
+
.b8 114
|
| 165 |
+
.b8 107
|
| 166 |
+
.b8 115
|
| 167 |
+
.b8 112
|
| 168 |
+
.b8 97
|
| 169 |
+
.b8 99
|
| 170 |
+
.b8 101
|
| 171 |
+
.b8 47
|
| 172 |
+
.b8 104
|
| 173 |
+
.b8 97
|
| 174 |
+
.b8 110
|
| 175 |
+
.b8 114
|
| 176 |
+
.b8 117
|
| 177 |
+
.b8 105
|
| 178 |
+
.b8 47
|
| 179 |
+
.b8 106
|
| 180 |
+
.b8 117
|
| 181 |
+
.b8 110
|
| 182 |
+
.b8 113
|
| 183 |
+
.b8 117
|
| 184 |
+
.b8 97
|
| 185 |
+
.b8 110
|
| 186 |
+
.b8 47
|
| 187 |
+
.b8 83
|
| 188 |
+
.b8 112
|
| 189 |
+
.b8 101
|
| 190 |
+
.b8 99
|
| 191 |
+
.b8 70
|
| 192 |
+
.b8 111
|
| 193 |
+
.b8 114
|
| 194 |
+
.b8 103
|
| 195 |
+
.b8 101
|
| 196 |
+
.b8 47
|
| 197 |
+
.b8 99
|
| 198 |
+
.b8 97
|
| 199 |
+
.b8 99
|
| 200 |
+
.b8 104
|
| 201 |
+
.b8 101
|
| 202 |
+
.b8 47
|
| 203 |
+
.b8 99
|
| 204 |
+
.b8 111
|
| 205 |
+
.b8 109
|
| 206 |
+
.b8 112
|
| 207 |
+
.b8 105
|
| 208 |
+
.b8 108
|
| 209 |
+
.b8 101
|
| 210 |
+
.b8 100
|
| 211 |
+
.b8 95
|
| 212 |
+
.b8 107
|
| 213 |
+
.b8 101
|
| 214 |
+
.b8 114
|
| 215 |
+
.b8 110
|
| 216 |
+
.b8 101
|
| 217 |
+
.b8 108
|
| 218 |
+
.b8 115
|
| 219 |
+
.b8 47
|
| 220 |
+
.b8 108
|
| 221 |
+
.b8 54
|
| 222 |
+
.b8 0
|
| 223 |
+
}
|
| 224 |
+
.section .debug_macinfo { }
|
progress/SpecForge/cache/compiled_kernels/triton/2/4WLUGGKP4KNR26REGICEZ2CG72LG46AK5KFUG6JJKIDMZRATGGHQ/triton_poi_fused_mul_1.source
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":18:0)
|
| 2 |
+
#loc14 = loc("in_ptr0"(#loc))
|
| 3 |
+
#loc15 = loc("out_ptr0"(#loc))
|
| 4 |
+
#loc16 = loc("xnumel"(#loc))
|
| 5 |
+
module {
|
| 6 |
+
tt.func public @triton_poi_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
|
| 7 |
+
%xnumel_0 = arith.constant 29184 : i32 loc(#loc17)
|
| 8 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc18)
|
| 9 |
+
%xoffset_1 = arith.constant 256 : i32 loc(#loc19)
|
| 10 |
+
%xoffset_2 = arith.constant 256 : i32 loc(#loc19)
|
| 11 |
+
%xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc19)
|
| 12 |
+
%xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc20)
|
| 13 |
+
%xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<256xi32> loc(#loc21)
|
| 14 |
+
%xindex_5 = arith.addi %xindex_4, %xindex : tensor<256xi32> loc(#loc21)
|
| 15 |
+
%xmask = arith.constant dense<29184> : tensor<256xi32> loc(#loc22)
|
| 16 |
+
%xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<256xi32> loc(#loc22)
|
| 17 |
+
%tmp0 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc23)
|
| 18 |
+
%tmp0_7 = tt.addptr %tmp0, %xindex_5 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc23)
|
| 19 |
+
%tmp0_8 = tt.load %tmp0_7, %xmask_6 : tensor<256x!tt.ptr<f32>> loc(#loc24)
|
| 20 |
+
%tmp1 = arith.constant 0.693147182 : f32 loc(#loc25)
|
| 21 |
+
%tmp2 = arith.constant dense<0.693147182> : tensor<256xf32> loc(#loc26)
|
| 22 |
+
%tmp2_9 = arith.mulf %tmp0_8, %tmp2 : tensor<256xf32> loc(#loc26)
|
| 23 |
+
%0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc11)
|
| 24 |
+
%1 = tt.addptr %0, %xindex_5 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc11)
|
| 25 |
+
tt.store %1, %tmp2_9, %xmask_6 : tensor<256x!tt.ptr<f32>> loc(#loc12)
|
| 26 |
+
tt.return loc(#loc13)
|
| 27 |
+
} loc(#loc)
|
| 28 |
+
} loc(#loc)
|
| 29 |
+
#loc1 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":19:13)
|
| 30 |
+
#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":20:28)
|
| 31 |
+
#loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":20:33)
|
| 32 |
+
#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":21:36)
|
| 33 |
+
#loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":21:23)
|
| 34 |
+
#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":22:21)
|
| 35 |
+
#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":24:30)
|
| 36 |
+
#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":24:35)
|
| 37 |
+
#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":25:11)
|
| 38 |
+
#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":26:18)
|
| 39 |
+
#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":27:25)
|
| 40 |
+
#loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":27:36)
|
| 41 |
+
#loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":27:4)
|
| 42 |
+
#loc17 = loc("xnumel"(#loc1))
|
| 43 |
+
#loc18 = loc("xoffset"(#loc2))
|
| 44 |
+
#loc19 = loc("xoffset"(#loc3))
|
| 45 |
+
#loc20 = loc("xindex"(#loc4))
|
| 46 |
+
#loc21 = loc("xindex"(#loc5))
|
| 47 |
+
#loc22 = loc("xmask"(#loc6))
|
| 48 |
+
#loc23 = loc("tmp0"(#loc7))
|
| 49 |
+
#loc24 = loc("tmp0"(#loc8))
|
| 50 |
+
#loc25 = loc("tmp1"(#loc9))
|
| 51 |
+
#loc26 = loc("tmp2"(#loc10))
|
progress/SpecForge/cache/compiled_kernels/triton/2/4WLUGGKP4KNR26REGICEZ2CG72LG46AK5KFUG6JJKIDMZRATGGHQ/triton_poi_fused_mul_1.ttgir
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
|
| 2 |
+
#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":18:0)
|
| 3 |
+
#loc13 = loc("in_ptr0"(#loc))
|
| 4 |
+
#loc14 = loc("out_ptr0"(#loc))
|
| 5 |
+
#loc15 = loc("xnumel"(#loc))
|
| 6 |
+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
|
| 7 |
+
tt.func public @triton_poi_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
|
| 8 |
+
%cst = arith.constant dense<29184> : tensor<256xi32, #blocked> loc(#loc1)
|
| 9 |
+
%c256_i32 = arith.constant 256 : i32 loc(#loc1)
|
| 10 |
+
%cst_0 = arith.constant dense<0.693147182> : tensor<256xf32, #blocked> loc(#loc1)
|
| 11 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc16)
|
| 12 |
+
%xoffset_1 = arith.muli %xoffset, %c256_i32 : i32 loc(#loc17)
|
| 13 |
+
%xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> loc(#loc18)
|
| 14 |
+
%xindex_2 = tt.splat %xoffset_1 : i32 -> tensor<256xi32, #blocked> loc(#loc19)
|
| 15 |
+
%xindex_3 = arith.addi %xindex_2, %xindex : tensor<256xi32, #blocked> loc(#loc19)
|
| 16 |
+
%xmask = arith.cmpi slt, %xindex_3, %cst : tensor<256xi32, #blocked> loc(#loc20)
|
| 17 |
+
%tmp0 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>, #blocked> loc(#loc21)
|
| 18 |
+
%tmp0_4 = tt.addptr %tmp0, %xindex_3 : tensor<256x!tt.ptr<f32>, #blocked>, tensor<256xi32, #blocked> loc(#loc21)
|
| 19 |
+
%tmp0_5 = tt.load %tmp0_4, %xmask : tensor<256x!tt.ptr<f32>, #blocked> loc(#loc22)
|
| 20 |
+
%tmp2 = arith.mulf %tmp0_5, %cst_0 : tensor<256xf32, #blocked> loc(#loc23)
|
| 21 |
+
%0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>, #blocked> loc(#loc10)
|
| 22 |
+
%1 = tt.addptr %0, %xindex_3 : tensor<256x!tt.ptr<f32>, #blocked>, tensor<256xi32, #blocked> loc(#loc10)
|
| 23 |
+
tt.store %1, %tmp2, %xmask : tensor<256x!tt.ptr<f32>, #blocked> loc(#loc11)
|
| 24 |
+
tt.return loc(#loc12)
|
| 25 |
+
} loc(#loc)
|
| 26 |
+
} loc(#loc)
|
| 27 |
+
#loc1 = loc(unknown)
|
| 28 |
+
#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":20:28)
|
| 29 |
+
#loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":20:33)
|
| 30 |
+
#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":21:36)
|
| 31 |
+
#loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":21:23)
|
| 32 |
+
#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":22:21)
|
| 33 |
+
#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":24:30)
|
| 34 |
+
#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":24:35)
|
| 35 |
+
#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":26:18)
|
| 36 |
+
#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":27:25)
|
| 37 |
+
#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":27:36)
|
| 38 |
+
#loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":27:4)
|
| 39 |
+
#loc16 = loc("xoffset"(#loc2))
|
| 40 |
+
#loc17 = loc("xoffset"(#loc3))
|
| 41 |
+
#loc18 = loc("xindex"(#loc4))
|
| 42 |
+
#loc19 = loc("xindex"(#loc5))
|
| 43 |
+
#loc20 = loc("xmask"(#loc6))
|
| 44 |
+
#loc21 = loc("tmp0"(#loc7))
|
| 45 |
+
#loc22 = loc("tmp0"(#loc8))
|
| 46 |
+
#loc23 = loc("tmp2"(#loc9))
|
progress/SpecForge/cache/compiled_kernels/triton/2/4WLUGGKP4KNR26REGICEZ2CG72LG46AK5KFUG6JJKIDMZRATGGHQ/triton_poi_fused_mul_1.ttir
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":18:0)
|
| 2 |
+
#loc13 = loc("in_ptr0"(#loc))
|
| 3 |
+
#loc14 = loc("out_ptr0"(#loc))
|
| 4 |
+
#loc15 = loc("xnumel"(#loc))
|
| 5 |
+
module {
|
| 6 |
+
tt.func public @triton_poi_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
|
| 7 |
+
%tmp2 = arith.constant dense<0.693147182> : tensor<256xf32> loc(#loc16)
|
| 8 |
+
%xmask = arith.constant dense<29184> : tensor<256xi32> loc(#loc17)
|
| 9 |
+
%c256_i32 = arith.constant 256 : i32 loc(#loc3)
|
| 10 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc18)
|
| 11 |
+
%xoffset_0 = arith.muli %xoffset, %c256_i32 : i32 loc(#loc19)
|
| 12 |
+
%xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc20)
|
| 13 |
+
%xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<256xi32> loc(#loc21)
|
| 14 |
+
%xindex_2 = arith.addi %xindex_1, %xindex : tensor<256xi32> loc(#loc21)
|
| 15 |
+
%xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<256xi32> loc(#loc17)
|
| 16 |
+
%tmp0 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc22)
|
| 17 |
+
%tmp0_4 = tt.addptr %tmp0, %xindex_2 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc22)
|
| 18 |
+
%tmp0_5 = tt.load %tmp0_4, %xmask_3 : tensor<256x!tt.ptr<f32>> loc(#loc23)
|
| 19 |
+
%tmp2_6 = arith.mulf %tmp0_5, %tmp2 : tensor<256xf32> loc(#loc16)
|
| 20 |
+
%0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<256x!tt.ptr<f32>> loc(#loc10)
|
| 21 |
+
%1 = tt.addptr %0, %xindex_2 : tensor<256x!tt.ptr<f32>>, tensor<256xi32> loc(#loc10)
|
| 22 |
+
tt.store %1, %tmp2_6, %xmask_3 : tensor<256x!tt.ptr<f32>> loc(#loc11)
|
| 23 |
+
tt.return loc(#loc12)
|
| 24 |
+
} loc(#loc)
|
| 25 |
+
} loc(#loc)
|
| 26 |
+
#loc1 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":26:18)
|
| 27 |
+
#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":22:21)
|
| 28 |
+
#loc3 = loc(unknown)
|
| 29 |
+
#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":20:28)
|
| 30 |
+
#loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":20:33)
|
| 31 |
+
#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":21:36)
|
| 32 |
+
#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":21:23)
|
| 33 |
+
#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":24:30)
|
| 34 |
+
#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":24:35)
|
| 35 |
+
#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":27:25)
|
| 36 |
+
#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":27:36)
|
| 37 |
+
#loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/l6/cl6xhyxtqtnczpdlmb3w5oimiq4gvtasqjswbgz4yakhnh4qkps7.py":27:4)
|
| 38 |
+
#loc16 = loc("tmp2"(#loc1))
|
| 39 |
+
#loc17 = loc("xmask"(#loc2))
|
| 40 |
+
#loc18 = loc("xoffset"(#loc4))
|
| 41 |
+
#loc19 = loc("xoffset"(#loc5))
|
| 42 |
+
#loc20 = loc("xindex"(#loc6))
|
| 43 |
+
#loc21 = loc("xindex"(#loc7))
|
| 44 |
+
#loc22 = loc("tmp0"(#loc8))
|
| 45 |
+
#loc23 = loc("tmp0"(#loc9))
|
progress/SpecForge/cache/compiled_kernels/triton/2/55FURAY6LWNBQ2KK76SAMFCTAPMGJOZ6YFWFPHZ7UXAHP4Z4YGJA/__grp__triton_red_fused_mul_0.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"child_paths": {"triton_red_fused_mul_0.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/55FURAY6LWNBQ2KK76SAMFCTAPMGJOZ6YFWFPHZ7UXAHP4Z4YGJA/triton_red_fused_mul_0.source", "triton_red_fused_mul_0.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/55FURAY6LWNBQ2KK76SAMFCTAPMGJOZ6YFWFPHZ7UXAHP4Z4YGJA/triton_red_fused_mul_0.ttir", "triton_red_fused_mul_0.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/55FURAY6LWNBQ2KK76SAMFCTAPMGJOZ6YFWFPHZ7UXAHP4Z4YGJA/triton_red_fused_mul_0.ttgir", "triton_red_fused_mul_0.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/55FURAY6LWNBQ2KK76SAMFCTAPMGJOZ6YFWFPHZ7UXAHP4Z4YGJA/triton_red_fused_mul_0.llir", "triton_red_fused_mul_0.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/55FURAY6LWNBQ2KK76SAMFCTAPMGJOZ6YFWFPHZ7UXAHP4Z4YGJA/triton_red_fused_mul_0.ptx", "triton_red_fused_mul_0.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/55FURAY6LWNBQ2KK76SAMFCTAPMGJOZ6YFWFPHZ7UXAHP4Z4YGJA/triton_red_fused_mul_0.cubin", "triton_red_fused_mul_0.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/55FURAY6LWNBQ2KK76SAMFCTAPMGJOZ6YFWFPHZ7UXAHP4Z4YGJA/triton_red_fused_mul_0.json"}}
|
progress/SpecForge/cache/compiled_kernels/triton/2/55FURAY6LWNBQ2KK76SAMFCTAPMGJOZ6YFWFPHZ7UXAHP4Z4YGJA/triton_red_fused_mul_0.cubin
ADDED
|
Binary file (20.6 kB). View file
|
|
|
progress/SpecForge/cache/compiled_kernels/triton/2/55FURAY6LWNBQ2KK76SAMFCTAPMGJOZ6YFWFPHZ7UXAHP4Z4YGJA/triton_red_fused_mul_0.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"hash": "ef4b48831e5d9a18694affa406145303d864bb3ec16c579f3fa5c077f33cc192", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_mul_0"}
|
progress/SpecForge/cache/compiled_kernels/triton/2/55FURAY6LWNBQ2KK76SAMFCTAPMGJOZ6YFWFPHZ7UXAHP4Z4YGJA/triton_red_fused_mul_0.llir
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
; ModuleID = 'LLVMDialectModule'
|
| 2 |
+
source_filename = "LLVMDialectModule"
|
| 3 |
+
target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
| 4 |
+
|
| 5 |
+
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
|
| 6 |
+
|
| 7 |
+
; Function Attrs: nounwind
|
| 8 |
+
define ptx_kernel void @triton_red_fused_mul_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 {
|
| 9 |
+
%9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
|
| 10 |
+
%10 = shl i32 %9, 6, !dbg !8
|
| 11 |
+
%11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
|
| 12 |
+
%12 = and i32 %11, 126, !dbg !9
|
| 13 |
+
%13 = lshr exact i32 %12, 1, !dbg !9
|
| 14 |
+
%14 = or disjoint i32 %13, %10, !dbg !10
|
| 15 |
+
%15 = icmp slt i32 %14, 54784, !dbg !11
|
| 16 |
+
%16 = shl nuw nsw i32 %11, 2, !dbg !12
|
| 17 |
+
%17 = and i32 %16, 4, !dbg !12
|
| 18 |
+
%18 = sdiv i32 %14, 1712, !dbg !13
|
| 19 |
+
%19 = shl i32 %14, 7, !dbg !14
|
| 20 |
+
%20 = shl i32 %14, 12
|
| 21 |
+
%21 = mul i32 %18, -7012224
|
| 22 |
+
%22 = add i32 %21, %20
|
| 23 |
+
%23 = zext nneg i32 %17 to i64, !dbg !15
|
| 24 |
+
%24 = sext i32 %19 to i64, !dbg !15
|
| 25 |
+
%invariant.gep = getelementptr bfloat, ptr addrspace(1) %1, i64 %24, !dbg !15
|
| 26 |
+
br label %25, !dbg !15
|
| 27 |
+
|
| 28 |
+
25: ; preds = %8, %25
|
| 29 |
+
%indvars.iv = phi i64 [ 0, %8 ], [ %indvars.iv.next, %25 ]
|
| 30 |
+
%26 = phi float [ 0.000000e+00, %8 ], [ %71, %25 ]
|
| 31 |
+
%27 = phi float [ 0.000000e+00, %8 ], [ %72, %25 ]
|
| 32 |
+
%28 = phi float [ 0.000000e+00, %8 ], [ %73, %25 ]
|
| 33 |
+
%29 = phi float [ 0.000000e+00, %8 ], [ %74, %25 ]
|
| 34 |
+
%30 = or disjoint i64 %indvars.iv, %23, !dbg !16
|
| 35 |
+
%31 = trunc nuw nsw i64 %30 to i32, !dbg !17
|
| 36 |
+
%32 = add i32 %22, %31, !dbg !17
|
| 37 |
+
%33 = sext i32 %32 to i64, !dbg !18
|
| 38 |
+
%34 = getelementptr bfloat, ptr addrspace(1) %0, i64 %33, !dbg !18
|
| 39 |
+
%35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !19
|
| 40 |
+
%36 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %34, i64 %35, i1 %15) #4, !dbg !19
|
| 41 |
+
%37 = extractvalue { i32, i32 } %36, 0, !dbg !19
|
| 42 |
+
%38 = bitcast i32 %37 to <2 x bfloat>, !dbg !19
|
| 43 |
+
%39 = extractvalue { i32, i32 } %36, 1, !dbg !19
|
| 44 |
+
%40 = bitcast i32 %39 to <2 x bfloat>, !dbg !19
|
| 45 |
+
%41 = extractelement <2 x bfloat> %38, i64 0, !dbg !19
|
| 46 |
+
%42 = extractelement <2 x bfloat> %38, i64 1, !dbg !19
|
| 47 |
+
%43 = extractelement <2 x bfloat> %40, i64 0, !dbg !19
|
| 48 |
+
%44 = extractelement <2 x bfloat> %40, i64 1, !dbg !19
|
| 49 |
+
%45 = fpext bfloat %41 to float, !dbg !20
|
| 50 |
+
%46 = fpext bfloat %42 to float, !dbg !20
|
| 51 |
+
%47 = fpext bfloat %43 to float, !dbg !20
|
| 52 |
+
%48 = fpext bfloat %44 to float, !dbg !20
|
| 53 |
+
%gep = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %30, !dbg !21
|
| 54 |
+
%49 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !22
|
| 55 |
+
%50 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %gep, i64 %49, i1 %15) #4, !dbg !22
|
| 56 |
+
%51 = extractvalue { i32, i32 } %50, 0, !dbg !22
|
| 57 |
+
%52 = bitcast i32 %51 to <2 x bfloat>, !dbg !22
|
| 58 |
+
%53 = extractvalue { i32, i32 } %50, 1, !dbg !22
|
| 59 |
+
%54 = bitcast i32 %53 to <2 x bfloat>, !dbg !22
|
| 60 |
+
%55 = extractelement <2 x bfloat> %52, i64 0, !dbg !22
|
| 61 |
+
%56 = extractelement <2 x bfloat> %52, i64 1, !dbg !22
|
| 62 |
+
%57 = extractelement <2 x bfloat> %54, i64 0, !dbg !22
|
| 63 |
+
%58 = extractelement <2 x bfloat> %54, i64 1, !dbg !22
|
| 64 |
+
%59 = fpext bfloat %55 to float, !dbg !23
|
| 65 |
+
%60 = fpext bfloat %56 to float, !dbg !23
|
| 66 |
+
%61 = fpext bfloat %57 to float, !dbg !23
|
| 67 |
+
%62 = fpext bfloat %58 to float, !dbg !23
|
| 68 |
+
%63 = fmul float %45, %59, !dbg !24
|
| 69 |
+
%64 = fmul float %46, %60, !dbg !24
|
| 70 |
+
%65 = fmul float %47, %61, !dbg !24
|
| 71 |
+
%66 = fmul float %48, %62, !dbg !24
|
| 72 |
+
%67 = fadd float %26, %63, !dbg !25
|
| 73 |
+
%68 = fadd float %27, %64, !dbg !25
|
| 74 |
+
%69 = fadd float %28, %65, !dbg !25
|
| 75 |
+
%70 = fadd float %29, %66, !dbg !25
|
| 76 |
+
%71 = select i1 %15, float %67, float %26, !dbg !26
|
| 77 |
+
%72 = select i1 %15, float %68, float %27, !dbg !26
|
| 78 |
+
%73 = select i1 %15, float %69, float %28, !dbg !26
|
| 79 |
+
%74 = select i1 %15, float %70, float %29, !dbg !26
|
| 80 |
+
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 8, !dbg !15
|
| 81 |
+
%75 = icmp samesign ult i64 %indvars.iv, 120, !dbg !15
|
| 82 |
+
br i1 %75, label %25, label %76, !dbg !15
|
| 83 |
+
|
| 84 |
+
76: ; preds = %25
|
| 85 |
+
%77 = and i32 %11, 63, !dbg !9
|
| 86 |
+
%78 = or disjoint i32 %10, %77, !dbg !10
|
| 87 |
+
%79 = icmp slt i32 %78, 54784, !dbg !11
|
| 88 |
+
%80 = fadd float %71, %72, !dbg !27
|
| 89 |
+
%81 = fadd float %73, %80, !dbg !27
|
| 90 |
+
%82 = fadd float %74, %81, !dbg !27
|
| 91 |
+
%83 = bitcast float %82 to i32, !dbg !31
|
| 92 |
+
%84 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %83, i32 1, i32 31), !dbg !31
|
| 93 |
+
%85 = bitcast i32 %84 to float, !dbg !31
|
| 94 |
+
%86 = fadd float %82, %85, !dbg !27
|
| 95 |
+
%87 = shl nuw nsw i32 %12, 1, !dbg !32
|
| 96 |
+
%88 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %87, !dbg !32
|
| 97 |
+
store float %86, ptr addrspace(3) %88, align 4, !dbg !32
|
| 98 |
+
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !32
|
| 99 |
+
%89 = shl nuw nsw i32 %77, 2, !dbg !32
|
| 100 |
+
%90 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %89, !dbg !32
|
| 101 |
+
%91 = load float, ptr addrspace(3) %90, align 4, !dbg !32
|
| 102 |
+
%92 = sext i32 %78 to i64, !dbg !33
|
| 103 |
+
%93 = getelementptr float, ptr addrspace(1) %2, i64 %92, !dbg !33
|
| 104 |
+
%94 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !34
|
| 105 |
+
%95 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %93, i64 %94, i1 %79) #4, !dbg !34
|
| 106 |
+
%96 = bitcast i32 %95 to float, !dbg !34
|
| 107 |
+
%97 = fmul float %96, 0x3FE62E4300000000, !dbg !35
|
| 108 |
+
%98 = fmul float %97, 0x3FF7154760000000, !dbg !36
|
| 109 |
+
%99 = fsub float %91, %98, !dbg !32
|
| 110 |
+
%100 = getelementptr float, ptr addrspace(1) %3, i64 %92, !dbg !37
|
| 111 |
+
%101 = and i32 %11, 64, !dbg !38
|
| 112 |
+
%102 = icmp eq i32 %101, 0, !dbg !38
|
| 113 |
+
%103 = bitcast float %99 to i32, !dbg !38
|
| 114 |
+
%104 = and i1 %102, %79, !dbg !38
|
| 115 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %103, ptr addrspace(1) %100, i1 %104) #4, !dbg !38
|
| 116 |
+
ret void, !dbg !39
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 120 |
+
declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
|
| 121 |
+
|
| 122 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
| 123 |
+
declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
|
| 124 |
+
|
| 125 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
| 126 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
|
| 127 |
+
|
| 128 |
+
; Function Attrs: convergent nocallback nounwind
|
| 129 |
+
declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
|
| 130 |
+
|
| 131 |
+
attributes #0 = { nounwind "nvvm.reqntid"="128" }
|
| 132 |
+
attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
| 133 |
+
attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
| 134 |
+
attributes #3 = { convergent nocallback nounwind }
|
| 135 |
+
attributes #4 = { nounwind }
|
| 136 |
+
|
| 137 |
+
!llvm.dbg.cu = !{!0}
|
| 138 |
+
!llvm.module.flags = !{!2, !3}
|
| 139 |
+
|
| 140 |
+
!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
|
| 141 |
+
!1 = !DIFile(filename: "ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py", directory: "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3")
|
| 142 |
+
!2 = !{i32 2, !"Debug Info Version", i32 3}
|
| 143 |
+
!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
| 144 |
+
!4 = distinct !DISubprogram(name: "triton_red_fused_mul_0", linkageName: "triton_red_fused_mul_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
|
| 145 |
+
!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
|
| 146 |
+
!6 = !{}
|
| 147 |
+
!7 = !DILocation(line: 23, column: 28, scope: !4)
|
| 148 |
+
!8 = !DILocation(line: 23, column: 33, scope: !4)
|
| 149 |
+
!9 = !DILocation(line: 24, column: 44, scope: !4)
|
| 150 |
+
!10 = !DILocation(line: 24, column: 23, scope: !4)
|
| 151 |
+
!11 = !DILocation(line: 25, column: 21, scope: !4)
|
| 152 |
+
!12 = !DILocation(line: 26, column: 37, scope: !4)
|
| 153 |
+
!13 = !DILocation(line: 29, column: 19, scope: !4)
|
| 154 |
+
!14 = !DILocation(line: 39, column: 45, scope: !4)
|
| 155 |
+
!15 = !DILocation(line: 32, column: 40, scope: !4)
|
| 156 |
+
!16 = !DILocation(line: 33, column: 31, scope: !4)
|
| 157 |
+
!17 = !DILocation(line: 38, column: 50, scope: !4)
|
| 158 |
+
!18 = !DILocation(line: 38, column: 34, scope: !4)
|
| 159 |
+
!19 = !DILocation(line: 38, column: 60, scope: !4)
|
| 160 |
+
!20 = !DILocation(line: 38, column: 122, scope: !4)
|
| 161 |
+
!21 = !DILocation(line: 39, column: 34, scope: !4)
|
| 162 |
+
!22 = !DILocation(line: 39, column: 50, scope: !4)
|
| 163 |
+
!23 = !DILocation(line: 39, column: 112, scope: !4)
|
| 164 |
+
!24 = !DILocation(line: 40, column: 22, scope: !4)
|
| 165 |
+
!25 = !DILocation(line: 42, column: 23, scope: !4)
|
| 166 |
+
!26 = !DILocation(line: 43, column: 48, scope: !4)
|
| 167 |
+
!27 = !DILocation(line: 261, column: 15, scope: !28, inlinedAt: !30)
|
| 168 |
+
!28 = distinct !DILexicalBlockFile(scope: !4, file: !29, discriminator: 0)
|
| 169 |
+
!29 = !DIFile(filename: "standard.py", directory: "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language")
|
| 170 |
+
!30 = !DILocation(line: 44, column: 25, scope: !4)
|
| 171 |
+
!31 = !DILocation(line: 291, column: 36, scope: !28, inlinedAt: !30)
|
| 172 |
+
!32 = !DILocation(line: 51, column: 19, scope: !4)
|
| 173 |
+
!33 = !DILocation(line: 45, column: 30, scope: !4)
|
| 174 |
+
!34 = !DILocation(line: 45, column: 35, scope: !4)
|
| 175 |
+
!35 = !DILocation(line: 48, column: 18, scope: !4)
|
| 176 |
+
!36 = !DILocation(line: 50, column: 19, scope: !4)
|
| 177 |
+
!37 = !DILocation(line: 52, column: 25, scope: !4)
|
| 178 |
+
!38 = !DILocation(line: 52, column: 37, scope: !4)
|
| 179 |
+
!39 = !DILocation(line: 52, column: 4, scope: !4)
|
progress/SpecForge/cache/compiled_kernels/triton/2/55FURAY6LWNBQ2KK76SAMFCTAPMGJOZ6YFWFPHZ7UXAHP4Z4YGJA/triton_red_fused_mul_0.ptx
ADDED
|
@@ -0,0 +1,424 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// Generated by LLVM NVPTX Back-End
|
| 3 |
+
//
|
| 4 |
+
|
| 5 |
+
.version 8.7
|
| 6 |
+
.target sm_90a
|
| 7 |
+
.address_size 64
|
| 8 |
+
|
| 9 |
+
// .globl triton_red_fused_mul_0 // -- Begin function triton_red_fused_mul_0
|
| 10 |
+
.extern .shared .align 16 .b8 global_smem[];
|
| 11 |
+
// @triton_red_fused_mul_0
|
| 12 |
+
.visible .entry triton_red_fused_mul_0(
|
| 13 |
+
.param .u64 .ptr .global .align 1 triton_red_fused_mul_0_param_0,
|
| 14 |
+
.param .u64 .ptr .global .align 1 triton_red_fused_mul_0_param_1,
|
| 15 |
+
.param .u64 .ptr .global .align 1 triton_red_fused_mul_0_param_2,
|
| 16 |
+
.param .u64 .ptr .global .align 1 triton_red_fused_mul_0_param_3,
|
| 17 |
+
.param .u32 triton_red_fused_mul_0_param_4,
|
| 18 |
+
.param .u32 triton_red_fused_mul_0_param_5,
|
| 19 |
+
.param .u64 .ptr .global .align 1 triton_red_fused_mul_0_param_6,
|
| 20 |
+
.param .u64 .ptr .global .align 1 triton_red_fused_mul_0_param_7
|
| 21 |
+
)
|
| 22 |
+
.reqntid 128
|
| 23 |
+
{
|
| 24 |
+
.reg .pred %p<7>;
|
| 25 |
+
.reg .b16 %rs<9>;
|
| 26 |
+
.reg .b32 %r<75>;
|
| 27 |
+
.reg .b64 %rd<29>;
|
| 28 |
+
.loc 1 18 0 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:18:0
|
| 29 |
+
$L__func_begin0:
|
| 30 |
+
.loc 1 18 0 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:18:0
|
| 31 |
+
|
| 32 |
+
// %bb.0:
|
| 33 |
+
ld.param.b64 %rd9, [triton_red_fused_mul_0_param_3];
|
| 34 |
+
ld.param.b64 %rd8, [triton_red_fused_mul_0_param_2];
|
| 35 |
+
ld.param.b64 %rd7, [triton_red_fused_mul_0_param_0];
|
| 36 |
+
ld.param.b64 %rd11, [triton_red_fused_mul_0_param_1];
|
| 37 |
+
$L__tmp0:
|
| 38 |
+
.loc 1 23 28 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:23:28
|
| 39 |
+
mov.u32 %r14, %ctaid.x;
|
| 40 |
+
.loc 1 23 33 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:23:33
|
| 41 |
+
shl.b32 %r1, %r14, 6;
|
| 42 |
+
.loc 1 24 44 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:24:44
|
| 43 |
+
mov.u32 %r2, %tid.x;
|
| 44 |
+
and.b32 %r3, %r2, 126;
|
| 45 |
+
bfe.u32 %r15, %r2, 1, 6;
|
| 46 |
+
.loc 1 24 23 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:24:23
|
| 47 |
+
or.b32 %r4, %r15, %r1;
|
| 48 |
+
.loc 1 26 37 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:26:37
|
| 49 |
+
shl.b32 %r16, %r2, 2;
|
| 50 |
+
and.b32 %r17, %r16, 4;
|
| 51 |
+
.loc 1 29 19 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:29:19
|
| 52 |
+
mul.hi.s32 %r18, %r4, 1284476201;
|
| 53 |
+
shr.u32 %r19, %r18, 31;
|
| 54 |
+
shr.s32 %r20, %r18, 9;
|
| 55 |
+
add.s32 %r21, %r20, %r19;
|
| 56 |
+
.loc 1 32 40 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:32:40
|
| 57 |
+
and.b32 %r22, %r2, 1;
|
| 58 |
+
mul.wide.u32 %rd12, %r22, 8;
|
| 59 |
+
shl.b32 %r23, %r14, 13;
|
| 60 |
+
shl.b32 %r24, %r15, 7;
|
| 61 |
+
or.b32 %r25, %r23, %r24;
|
| 62 |
+
mul.wide.s32 %rd13, %r25, 2;
|
| 63 |
+
or.b64 %rd14, %rd12, %rd13;
|
| 64 |
+
add.s64 %rd27, %rd11, %rd14;
|
| 65 |
+
shl.b32 %r26, %r14, 18;
|
| 66 |
+
shl.b32 %r27, %r15, 12;
|
| 67 |
+
or.b32 %r28, %r26, %r27;
|
| 68 |
+
or.b32 %r29, %r28, %r17;
|
| 69 |
+
mul.lo.s32 %r30, %r21, 7012224;
|
| 70 |
+
sub.s32 %r31, %r29, %r30;
|
| 71 |
+
cvt.u64.u32 %rd2, %r31;
|
| 72 |
+
mov.b32 %r71, 0f00000000;
|
| 73 |
+
mov.b64 %rd28, -8;
|
| 74 |
+
setp.lt.s32 %p1, %r4, 54784;
|
| 75 |
+
mov.b32 %r72, %r71;
|
| 76 |
+
mov.b32 %r73, %r71;
|
| 77 |
+
mov.b32 %r74, %r71;
|
| 78 |
+
$L__BB0_1: // =>This Inner Loop Header: Depth=1
|
| 79 |
+
.loc 1 38 34 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:38:34
|
| 80 |
+
add.s64 %rd21, %rd2, %rd28;
|
| 81 |
+
cvt.u32.u64 %r40, %rd21;
|
| 82 |
+
add.s32 %r41, %r40, 8;
|
| 83 |
+
mad.wide.s32 %rd16, %r41, 2, %rd7;
|
| 84 |
+
.loc 1 38 60 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:38:60
|
| 85 |
+
// begin inline asm
|
| 86 |
+
mov.u64 %rd15, 0x0;
|
| 87 |
+
createpolicy.fractional.L2::evict_first.b64 %rd15, 1.0;
|
| 88 |
+
// end inline asm
|
| 89 |
+
mov.b32 %r34, 0;
|
| 90 |
+
// begin inline asm
|
| 91 |
+
mov.u32 %r32, %r34;
|
| 92 |
+
mov.u32 %r33, %r34;
|
| 93 |
+
@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r32, %r33 }, [ %rd16 + 0 ], %rd15;
|
| 94 |
+
// end inline asm
|
| 95 |
+
mov.b32 {%rs1, %rs2}, %r32;
|
| 96 |
+
mov.b32 {%rs3, %rs4}, %r33;
|
| 97 |
+
.loc 1 38 122 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:38:122
|
| 98 |
+
cvt.f32.bf16 %r42, %rs1;
|
| 99 |
+
cvt.f32.bf16 %r43, %rs2;
|
| 100 |
+
cvt.f32.bf16 %r44, %rs3;
|
| 101 |
+
cvt.f32.bf16 %r45, %rs4;
|
| 102 |
+
.loc 1 39 50 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:39:50
|
| 103 |
+
// begin inline asm
|
| 104 |
+
mov.u64 %rd18, 0x0;
|
| 105 |
+
createpolicy.fractional.L2::evict_first.b64 %rd18, 1.0;
|
| 106 |
+
// end inline asm
|
| 107 |
+
// begin inline asm
|
| 108 |
+
mov.u32 %r36, %r34;
|
| 109 |
+
mov.u32 %r37, %r34;
|
| 110 |
+
@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r36, %r37 }, [ %rd27 + 0 ], %rd18;
|
| 111 |
+
// end inline asm
|
| 112 |
+
mov.b32 {%rs5, %rs6}, %r36;
|
| 113 |
+
mov.b32 {%rs7, %rs8}, %r37;
|
| 114 |
+
.loc 1 39 112 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:39:112
|
| 115 |
+
cvt.f32.bf16 %r46, %rs5;
|
| 116 |
+
cvt.f32.bf16 %r47, %rs6;
|
| 117 |
+
cvt.f32.bf16 %r48, %rs7;
|
| 118 |
+
cvt.f32.bf16 %r49, %rs8;
|
| 119 |
+
.loc 1 42 23 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:42:23
|
| 120 |
+
fma.rn.f32 %r50, %r42, %r46, %r71;
|
| 121 |
+
fma.rn.f32 %r51, %r43, %r47, %r72;
|
| 122 |
+
fma.rn.f32 %r52, %r44, %r48, %r73;
|
| 123 |
+
fma.rn.f32 %r53, %r45, %r49, %r74;
|
| 124 |
+
.loc 1 43 48 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:43:48
|
| 125 |
+
selp.f32 %r71, %r50, %r71, %p1;
|
| 126 |
+
selp.f32 %r72, %r51, %r72, %p1;
|
| 127 |
+
selp.f32 %r73, %r52, %r73, %p1;
|
| 128 |
+
selp.f32 %r74, %r53, %r74, %p1;
|
| 129 |
+
.loc 1 32 40 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:32:40
|
| 130 |
+
add.s64 %rd28, %rd28, 8;
|
| 131 |
+
add.s64 %rd27, %rd27, 16;
|
| 132 |
+
setp.lt.u64 %p3, %rd28, 120;
|
| 133 |
+
@%p3 bra $L__BB0_1;
|
| 134 |
+
// %bb.2:
|
| 135 |
+
.loc 1 24 44 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:24:44
|
| 136 |
+
and.b32 %r56, %r2, 63;
|
| 137 |
+
.loc 1 24 23 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:24:23
|
| 138 |
+
or.b32 %r57, %r1, %r56;
|
| 139 |
+
.loc 1 25 21 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:25:21
|
| 140 |
+
setp.lt.s32 %p4, %r57, 54784;
|
| 141 |
+
$L__tmp1:
|
| 142 |
+
.loc 2 261 15 // standard.py:261:15 @[ ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:44:25 ]
|
| 143 |
+
add.f32 %r58, %r71, %r72;
|
| 144 |
+
add.f32 %r59, %r73, %r58;
|
| 145 |
+
add.f32 %r60, %r74, %r59;
|
| 146 |
+
.loc 2 291 36 // standard.py:291:36 @[ ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:44:25 ]
|
| 147 |
+
shfl.sync.bfly.b32 %r61, %r60, 1, 31, -1;
|
| 148 |
+
.loc 2 261 15 // standard.py:261:15 @[ ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:44:25 ]
|
| 149 |
+
add.f32 %r62, %r60, %r61;
|
| 150 |
+
$L__tmp2:
|
| 151 |
+
.loc 1 51 19 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:51:19
|
| 152 |
+
shl.b32 %r63, %r3, 1;
|
| 153 |
+
mov.b32 %r64, global_smem;
|
| 154 |
+
add.s32 %r65, %r64, %r63;
|
| 155 |
+
st.shared.b32 [%r65], %r62;
|
| 156 |
+
bar.sync 0;
|
| 157 |
+
shl.b32 %r66, %r56, 2;
|
| 158 |
+
add.s32 %r67, %r64, %r66;
|
| 159 |
+
ld.shared.b32 %r68, [%r67];
|
| 160 |
+
.loc 1 45 30 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:45:30
|
| 161 |
+
mul.wide.s32 %rd26, %r57, 4;
|
| 162 |
+
add.s64 %rd23, %rd8, %rd26;
|
| 163 |
+
.loc 1 45 35 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:45:35
|
| 164 |
+
// begin inline asm
|
| 165 |
+
mov.u64 %rd24, 0x0;
|
| 166 |
+
createpolicy.fractional.L2::evict_last.b64 %rd24, 1.0;
|
| 167 |
+
// end inline asm
|
| 168 |
+
// begin inline asm
|
| 169 |
+
mov.u32 %r54, 0x0;
|
| 170 |
+
@%p4 ld.global.L1::evict_last.L2::cache_hint.b32 { %r54 }, [ %rd23 + 0 ], %rd24;
|
| 171 |
+
// end inline asm
|
| 172 |
+
.loc 1 48 18 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:48:18
|
| 173 |
+
mul.f32 %r69, %r54, 0fBF317218;
|
| 174 |
+
.loc 1 51 19 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:51:19
|
| 175 |
+
fma.rn.f32 %r55, %r69, 0f3FB8AA3B, %r68;
|
| 176 |
+
.loc 1 52 25 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:52:25
|
| 177 |
+
add.s64 %rd25, %rd9, %rd26;
|
| 178 |
+
.loc 1 52 37 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:52:37
|
| 179 |
+
and.b32 %r70, %r2, 64;
|
| 180 |
+
setp.eq.b32 %p6, %r70, 0;
|
| 181 |
+
and.pred %p5, %p6, %p4;
|
| 182 |
+
// begin inline asm
|
| 183 |
+
@%p5 st.global.b32 [ %rd25 + 0 ], { %r55 };
|
| 184 |
+
// end inline asm
|
| 185 |
+
.loc 1 52 4 // ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py:52:4
|
| 186 |
+
ret;
|
| 187 |
+
$L__tmp3:
|
| 188 |
+
$L__func_end0:
|
| 189 |
+
// -- End function
|
| 190 |
+
}
|
| 191 |
+
.file 1 "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py"
|
| 192 |
+
.file 2 "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py"
|
| 193 |
+
.section .debug_abbrev
|
| 194 |
+
{
|
| 195 |
+
.b8 1 // Abbreviation Code
|
| 196 |
+
.b8 17 // DW_TAG_compile_unit
|
| 197 |
+
.b8 1 // DW_CHILDREN_yes
|
| 198 |
+
.b8 37 // DW_AT_producer
|
| 199 |
+
.b8 8 // DW_FORM_string
|
| 200 |
+
.b8 19 // DW_AT_language
|
| 201 |
+
.b8 5 // DW_FORM_data2
|
| 202 |
+
.b8 3 // DW_AT_name
|
| 203 |
+
.b8 8 // DW_FORM_string
|
| 204 |
+
.b8 16 // DW_AT_stmt_list
|
| 205 |
+
.b8 6 // DW_FORM_data4
|
| 206 |
+
.b8 27 // DW_AT_comp_dir
|
| 207 |
+
.b8 8 // DW_FORM_string
|
| 208 |
+
.b8 0 // EOM(1)
|
| 209 |
+
.b8 0 // EOM(2)
|
| 210 |
+
.b8 2 // Abbreviation Code
|
| 211 |
+
.b8 46 // DW_TAG_subprogram
|
| 212 |
+
.b8 0 // DW_CHILDREN_no
|
| 213 |
+
.b8 3 // DW_AT_name
|
| 214 |
+
.b8 8 // DW_FORM_string
|
| 215 |
+
.b8 32 // DW_AT_inline
|
| 216 |
+
.b8 11 // DW_FORM_data1
|
| 217 |
+
.b8 0 // EOM(1)
|
| 218 |
+
.b8 0 // EOM(2)
|
| 219 |
+
.b8 3 // Abbreviation Code
|
| 220 |
+
.b8 46 // DW_TAG_subprogram
|
| 221 |
+
.b8 1 // DW_CHILDREN_yes
|
| 222 |
+
.b8 17 // DW_AT_low_pc
|
| 223 |
+
.b8 1 // DW_FORM_addr
|
| 224 |
+
.b8 18 // DW_AT_high_pc
|
| 225 |
+
.b8 1 // DW_FORM_addr
|
| 226 |
+
.b8 49 // DW_AT_abstract_origin
|
| 227 |
+
.b8 19 // DW_FORM_ref4
|
| 228 |
+
.b8 0 // EOM(1)
|
| 229 |
+
.b8 0 // EOM(2)
|
| 230 |
+
.b8 4 // Abbreviation Code
|
| 231 |
+
.b8 29 // DW_TAG_inlined_subroutine
|
| 232 |
+
.b8 0 // DW_CHILDREN_no
|
| 233 |
+
.b8 49 // DW_AT_abstract_origin
|
| 234 |
+
.b8 19 // DW_FORM_ref4
|
| 235 |
+
.b8 17 // DW_AT_low_pc
|
| 236 |
+
.b8 1 // DW_FORM_addr
|
| 237 |
+
.b8 18 // DW_AT_high_pc
|
| 238 |
+
.b8 1 // DW_FORM_addr
|
| 239 |
+
.b8 88 // DW_AT_call_file
|
| 240 |
+
.b8 11 // DW_FORM_data1
|
| 241 |
+
.b8 89 // DW_AT_call_line
|
| 242 |
+
.b8 11 // DW_FORM_data1
|
| 243 |
+
.b8 87 // DW_AT_call_column
|
| 244 |
+
.b8 11 // DW_FORM_data1
|
| 245 |
+
.b8 0 // EOM(1)
|
| 246 |
+
.b8 0 // EOM(2)
|
| 247 |
+
.b8 0 // EOM(3)
|
| 248 |
+
}
|
| 249 |
+
.section .debug_info
|
| 250 |
+
{
|
| 251 |
+
.b32 211 // Length of Unit
|
| 252 |
+
.b8 2 // DWARF version number
|
| 253 |
+
.b8 0
|
| 254 |
+
.b32 .debug_abbrev // Offset Into Abbrev. Section
|
| 255 |
+
.b8 8 // Address Size (in bytes)
|
| 256 |
+
.b8 1 // Abbrev [1] 0xb:0xcc DW_TAG_compile_unit
|
| 257 |
+
.b8 116 // DW_AT_producer
|
| 258 |
+
.b8 114
|
| 259 |
+
.b8 105
|
| 260 |
+
.b8 116
|
| 261 |
+
.b8 111
|
| 262 |
+
.b8 110
|
| 263 |
+
.b8 0
|
| 264 |
+
.b8 2 // DW_AT_language
|
| 265 |
+
.b8 0
|
| 266 |
+
.b8 99 // DW_AT_name
|
| 267 |
+
.b8 116
|
| 268 |
+
.b8 51
|
| 269 |
+
.b8 105
|
| 270 |
+
.b8 100
|
| 271 |
+
.b8 120
|
| 272 |
+
.b8 101
|
| 273 |
+
.b8 99
|
| 274 |
+
.b8 55
|
| 275 |
+
.b8 54
|
| 276 |
+
.b8 113
|
| 277 |
+
.b8 116
|
| 278 |
+
.b8 108
|
| 279 |
+
.b8 51
|
| 280 |
+
.b8 118
|
| 281 |
+
.b8 115
|
| 282 |
+
.b8 104
|
| 283 |
+
.b8 118
|
| 284 |
+
.b8 118
|
| 285 |
+
.b8 119
|
| 286 |
+
.b8 110
|
| 287 |
+
.b8 121
|
| 288 |
+
.b8 117
|
| 289 |
+
.b8 54
|
| 290 |
+
.b8 114
|
| 291 |
+
.b8 101
|
| 292 |
+
.b8 98
|
| 293 |
+
.b8 107
|
| 294 |
+
.b8 98
|
| 295 |
+
.b8 111
|
| 296 |
+
.b8 107
|
| 297 |
+
.b8 54
|
| 298 |
+
.b8 122
|
| 299 |
+
.b8 53
|
| 300 |
+
.b8 115
|
| 301 |
+
.b8 111
|
| 302 |
+
.b8 103
|
| 303 |
+
.b8 116
|
| 304 |
+
.b8 118
|
| 305 |
+
.b8 108
|
| 306 |
+
.b8 122
|
| 307 |
+
.b8 54
|
| 308 |
+
.b8 97
|
| 309 |
+
.b8 54
|
| 310 |
+
.b8 50
|
| 311 |
+
.b8 115
|
| 312 |
+
.b8 120
|
| 313 |
+
.b8 101
|
| 314 |
+
.b8 100
|
| 315 |
+
.b8 117
|
| 316 |
+
.b8 113
|
| 317 |
+
.b8 112
|
| 318 |
+
.b8 46
|
| 319 |
+
.b8 112
|
| 320 |
+
.b8 121
|
| 321 |
+
.b8 0
|
| 322 |
+
.b32 .debug_line // DW_AT_stmt_list
|
| 323 |
+
.b8 47 // DW_AT_comp_dir
|
| 324 |
+
.b8 119
|
| 325 |
+
.b8 111
|
| 326 |
+
.b8 114
|
| 327 |
+
.b8 107
|
| 328 |
+
.b8 115
|
| 329 |
+
.b8 112
|
| 330 |
+
.b8 97
|
| 331 |
+
.b8 99
|
| 332 |
+
.b8 101
|
| 333 |
+
.b8 47
|
| 334 |
+
.b8 104
|
| 335 |
+
.b8 97
|
| 336 |
+
.b8 110
|
| 337 |
+
.b8 114
|
| 338 |
+
.b8 117
|
| 339 |
+
.b8 105
|
| 340 |
+
.b8 47
|
| 341 |
+
.b8 106
|
| 342 |
+
.b8 117
|
| 343 |
+
.b8 110
|
| 344 |
+
.b8 113
|
| 345 |
+
.b8 117
|
| 346 |
+
.b8 97
|
| 347 |
+
.b8 110
|
| 348 |
+
.b8 47
|
| 349 |
+
.b8 83
|
| 350 |
+
.b8 112
|
| 351 |
+
.b8 101
|
| 352 |
+
.b8 99
|
| 353 |
+
.b8 70
|
| 354 |
+
.b8 111
|
| 355 |
+
.b8 114
|
| 356 |
+
.b8 103
|
| 357 |
+
.b8 101
|
| 358 |
+
.b8 47
|
| 359 |
+
.b8 99
|
| 360 |
+
.b8 97
|
| 361 |
+
.b8 99
|
| 362 |
+
.b8 104
|
| 363 |
+
.b8 101
|
| 364 |
+
.b8 47
|
| 365 |
+
.b8 99
|
| 366 |
+
.b8 111
|
| 367 |
+
.b8 109
|
| 368 |
+
.b8 112
|
| 369 |
+
.b8 105
|
| 370 |
+
.b8 108
|
| 371 |
+
.b8 101
|
| 372 |
+
.b8 100
|
| 373 |
+
.b8 95
|
| 374 |
+
.b8 107
|
| 375 |
+
.b8 101
|
| 376 |
+
.b8 114
|
| 377 |
+
.b8 110
|
| 378 |
+
.b8 101
|
| 379 |
+
.b8 108
|
| 380 |
+
.b8 115
|
| 381 |
+
.b8 47
|
| 382 |
+
.b8 116
|
| 383 |
+
.b8 51
|
| 384 |
+
.b8 0
|
| 385 |
+
.b8 2 // Abbrev [2] 0x8f:0x19 DW_TAG_subprogram
|
| 386 |
+
.b8 116 // DW_AT_name
|
| 387 |
+
.b8 114
|
| 388 |
+
.b8 105
|
| 389 |
+
.b8 116
|
| 390 |
+
.b8 111
|
| 391 |
+
.b8 110
|
| 392 |
+
.b8 95
|
| 393 |
+
.b8 114
|
| 394 |
+
.b8 101
|
| 395 |
+
.b8 100
|
| 396 |
+
.b8 95
|
| 397 |
+
.b8 102
|
| 398 |
+
.b8 117
|
| 399 |
+
.b8 115
|
| 400 |
+
.b8 101
|
| 401 |
+
.b8 100
|
| 402 |
+
.b8 95
|
| 403 |
+
.b8 109
|
| 404 |
+
.b8 117
|
| 405 |
+
.b8 108
|
| 406 |
+
.b8 95
|
| 407 |
+
.b8 48
|
| 408 |
+
.b8 0
|
| 409 |
+
.b8 1 // DW_AT_inline
|
| 410 |
+
.b8 3 // Abbrev [3] 0xa8:0x2e DW_TAG_subprogram
|
| 411 |
+
.b64 $L__func_begin0 // DW_AT_low_pc
|
| 412 |
+
.b64 $L__func_end0 // DW_AT_high_pc
|
| 413 |
+
.b32 143 // DW_AT_abstract_origin
|
| 414 |
+
.b8 4 // Abbrev [4] 0xbd:0x18 DW_TAG_inlined_subroutine
|
| 415 |
+
.b32 143 // DW_AT_abstract_origin
|
| 416 |
+
.b64 $L__tmp1 // DW_AT_low_pc
|
| 417 |
+
.b64 $L__tmp2 // DW_AT_high_pc
|
| 418 |
+
.b8 1 // DW_AT_call_file
|
| 419 |
+
.b8 44 // DW_AT_call_line
|
| 420 |
+
.b8 25 // DW_AT_call_column
|
| 421 |
+
.b8 0 // End Of Children Mark
|
| 422 |
+
.b8 0 // End Of Children Mark
|
| 423 |
+
}
|
| 424 |
+
.section .debug_macinfo { }
|
progress/SpecForge/cache/compiled_kernels/triton/2/55FURAY6LWNBQ2KK76SAMFCTAPMGJOZ6YFWFPHZ7UXAHP4Z4YGJA/triton_red_fused_mul_0.source
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":18:0)
|
| 2 |
+
#loc48 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0)
|
| 3 |
+
#loc50 = loc(unknown)
|
| 4 |
+
#loc53 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0)
|
| 5 |
+
#loc57 = loc("in_ptr0"(#loc))
|
| 6 |
+
#loc58 = loc("in_ptr1"(#loc))
|
| 7 |
+
#loc59 = loc("in_ptr2"(#loc))
|
| 8 |
+
#loc60 = loc("out_ptr1"(#loc))
|
| 9 |
+
#loc61 = loc("xnumel"(#loc))
|
| 10 |
+
#loc62 = loc("r0_numel"(#loc))
|
| 11 |
+
#loc106 = loc("input"(#loc48))
|
| 12 |
+
#loc107 = loc("a"(#loc53))
|
| 13 |
+
#loc108 = loc("b"(#loc53))
|
| 14 |
+
module {
|
| 15 |
+
tt.func public @triton_red_fused_mul_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 16 |
+
%xnumel_0 = arith.constant 54784 : i32 loc(#loc63)
|
| 17 |
+
%r0_numel_1 = arith.constant 128 : i32 loc(#loc64)
|
| 18 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc65)
|
| 19 |
+
%xoffset_2 = arith.constant 64 : i32 loc(#loc66)
|
| 20 |
+
%xoffset_3 = arith.constant 64 : i32 loc(#loc66)
|
| 21 |
+
%xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc66)
|
| 22 |
+
%xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc67)
|
| 23 |
+
%xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc68)
|
| 24 |
+
%xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc69)
|
| 25 |
+
%xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc69)
|
| 26 |
+
%xmask = arith.constant dense<54784> : tensor<64x1xi32> loc(#loc70)
|
| 27 |
+
%xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<64x1xi32> loc(#loc70)
|
| 28 |
+
%r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc71)
|
| 29 |
+
%r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32> -> tensor<1x8xi32> loc(#loc72)
|
| 30 |
+
%x0 = arith.constant 1712 : i32 loc(#loc73)
|
| 31 |
+
%x0_10 = arith.constant 1712 : i32 loc(#loc73)
|
| 32 |
+
%x0_11 = arith.constant dense<1712> : tensor<64x1xi32> loc(#loc73)
|
| 33 |
+
%x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc73)
|
| 34 |
+
%x1 = arith.constant 1712 : i32 loc(#loc74)
|
| 35 |
+
%x1_13 = arith.constant 1712 : i32 loc(#loc74)
|
| 36 |
+
%x1_14 = arith.constant dense<1712> : tensor<64x1xi32> loc(#loc74)
|
| 37 |
+
%x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc74)
|
| 38 |
+
%_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc75)
|
| 39 |
+
%_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc75)
|
| 40 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc14)
|
| 41 |
+
%c8_i32 = arith.constant 8 : i32 loc(#loc14)
|
| 42 |
+
%0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
|
| 43 |
+
%1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
|
| 44 |
+
%2 = arith.bitcast %c8_i32 : i32 to i32 loc(#loc14)
|
| 45 |
+
%3 = ub.poison : i32 loc(#loc14)
|
| 46 |
+
%_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_16) -> (tensor<64x8xf32>) : i32 {
|
| 47 |
+
%r0_index = tt.splat %r0_offset : i32 -> tensor<1x8xi32> loc(#loc77)
|
| 48 |
+
%r0_index_24 = arith.addi %r0_index, %r0_base_9 : tensor<1x8xi32> loc(#loc77)
|
| 49 |
+
%r0_mask = arith.constant dense<128> : tensor<1x8xi32> loc(#loc78)
|
| 50 |
+
%r0_mask_25 = arith.cmpi slt, %r0_index_24, %r0_mask : tensor<1x8xi32> loc(#loc78)
|
| 51 |
+
%tmp0 = arith.constant 128 : i32 loc(#loc79)
|
| 52 |
+
%tmp0_26 = arith.constant 128 : i32 loc(#loc79)
|
| 53 |
+
%tmp0_27 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc79)
|
| 54 |
+
%tmp0_28 = arith.muli %tmp0_27, %x1_15 : tensor<64x1xi32> loc(#loc79)
|
| 55 |
+
%tmp0_29 = tt.broadcast %r0_index_24 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc80)
|
| 56 |
+
%tmp0_30 = tt.broadcast %tmp0_28 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc80)
|
| 57 |
+
%tmp0_31 = arith.addi %tmp0_29, %tmp0_30 : tensor<64x8xi32> loc(#loc80)
|
| 58 |
+
%tmp0_32 = arith.constant 4096 : i32 loc(#loc81)
|
| 59 |
+
%tmp0_33 = arith.constant 4096 : i32 loc(#loc81)
|
| 60 |
+
%tmp0_34 = arith.constant dense<4096> : tensor<64x1xi32> loc(#loc81)
|
| 61 |
+
%tmp0_35 = arith.muli %tmp0_34, %x0_12 : tensor<64x1xi32> loc(#loc81)
|
| 62 |
+
%tmp0_36 = tt.broadcast %tmp0_35 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc82)
|
| 63 |
+
%tmp0_37 = arith.addi %tmp0_31, %tmp0_36 : tensor<64x8xi32> loc(#loc82)
|
| 64 |
+
%tmp0_38 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc83)
|
| 65 |
+
%tmp0_39 = tt.addptr %tmp0_38, %tmp0_37 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc83)
|
| 66 |
+
%tmp0_40 = tt.broadcast %r0_mask_25 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc84)
|
| 67 |
+
%tmp0_41 = tt.broadcast %xmask_8 : tensor<64x1xi1> -> tensor<64x8xi1> loc(#loc84)
|
| 68 |
+
%tmp0_42 = arith.andi %tmp0_40, %tmp0_41 : tensor<64x8xi1> loc(#loc84)
|
| 69 |
+
%tmp0_43 = arith.constant 0.000000e+00 : f32 loc(#loc85)
|
| 70 |
+
%tmp0_44 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc85)
|
| 71 |
+
%tmp0_45 = arith.truncf %tmp0_44 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc85)
|
| 72 |
+
%tmp0_46 = tt.load %tmp0_39, %tmp0_42, %tmp0_45 evictionPolicy = evict_first : tensor<64x8x!tt.ptr<bf16>> loc(#loc85)
|
| 73 |
+
%tmp0_47 = arith.extf %tmp0_46 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc86)
|
| 74 |
+
%tmp1 = arith.constant 128 : i32 loc(#loc87)
|
| 75 |
+
%tmp1_48 = arith.constant 128 : i32 loc(#loc87)
|
| 76 |
+
%tmp1_49 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc87)
|
| 77 |
+
%tmp1_50 = arith.muli %tmp1_49, %xindex_7 : tensor<64x1xi32> loc(#loc87)
|
| 78 |
+
%tmp1_51 = tt.broadcast %r0_index_24 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc88)
|
| 79 |
+
%tmp1_52 = tt.broadcast %tmp1_50 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc88)
|
| 80 |
+
%tmp1_53 = arith.addi %tmp1_51, %tmp1_52 : tensor<64x8xi32> loc(#loc88)
|
| 81 |
+
%tmp1_54 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc89)
|
| 82 |
+
%tmp1_55 = tt.addptr %tmp1_54, %tmp1_53 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc89)
|
| 83 |
+
%tmp1_56 = tt.broadcast %r0_mask_25 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc90)
|
| 84 |
+
%tmp1_57 = tt.broadcast %xmask_8 : tensor<64x1xi1> -> tensor<64x8xi1> loc(#loc90)
|
| 85 |
+
%tmp1_58 = arith.andi %tmp1_56, %tmp1_57 : tensor<64x8xi1> loc(#loc90)
|
| 86 |
+
%tmp1_59 = arith.constant 0.000000e+00 : f32 loc(#loc91)
|
| 87 |
+
%tmp1_60 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc91)
|
| 88 |
+
%tmp1_61 = arith.truncf %tmp1_60 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc91)
|
| 89 |
+
%tmp1_62 = tt.load %tmp1_55, %tmp1_58, %tmp1_61 evictionPolicy = evict_first : tensor<64x8x!tt.ptr<bf16>> loc(#loc91)
|
| 90 |
+
%tmp1_63 = arith.extf %tmp1_62 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc92)
|
| 91 |
+
%tmp2 = arith.mulf %tmp0_47, %tmp1_63 : tensor<64x8xf32> loc(#loc93)
|
| 92 |
+
%tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<64x8xf32> loc(#loc94)
|
| 93 |
+
%_tmp4_64 = tt.broadcast %r0_mask_25 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc95)
|
| 94 |
+
%_tmp4_65 = tt.broadcast %xmask_8 : tensor<64x1xi1> -> tensor<64x8xi1> loc(#loc95)
|
| 95 |
+
%_tmp4_66 = arith.andi %_tmp4_64, %_tmp4_65 : tensor<64x8xi1> loc(#loc95)
|
| 96 |
+
%_tmp4_67 = arith.select %_tmp4_66, %tmp5, %_tmp4_23 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc96)
|
| 97 |
+
scf.yield %_tmp4_67 : tensor<64x8xf32> loc(#loc35)
|
| 98 |
+
} loc(#loc76)
|
| 99 |
+
%tmp4 = tt.call @"triton.language.standard.sum__fp32S64_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc97)
|
| 100 |
+
%tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc98)
|
| 101 |
+
%tmp7 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc99)
|
| 102 |
+
%tmp7_19 = tt.addptr %tmp7, %xindex_7 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc99)
|
| 103 |
+
%tmp7_20 = tt.load %tmp7_19, %xmask_8 evictionPolicy = evict_last : tensor<64x1x!tt.ptr<f32>> loc(#loc100)
|
| 104 |
+
%tmp8 = arith.constant 0.693147182 : f32 loc(#loc101)
|
| 105 |
+
%tmp9 = arith.constant dense<0.693147182> : tensor<64x1xf32> loc(#loc102)
|
| 106 |
+
%tmp9_21 = arith.mulf %tmp7_20, %tmp9 : tensor<64x1xf32> loc(#loc102)
|
| 107 |
+
%tmp10 = arith.constant 1.44269502 : f32 loc(#loc103)
|
| 108 |
+
%tmp11 = arith.constant dense<1.44269502> : tensor<64x1xf32> loc(#loc104)
|
| 109 |
+
%tmp11_22 = arith.mulf %tmp9_21, %tmp11 : tensor<64x1xf32> loc(#loc104)
|
| 110 |
+
%tmp12 = arith.subf %tmp4_18, %tmp11_22 : tensor<64x1xf32> loc(#loc105)
|
| 111 |
+
%4 = tt.splat %out_ptr1 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc45)
|
| 112 |
+
%5 = tt.addptr %4, %xindex_7 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc45)
|
| 113 |
+
tt.store %5, %tmp12, %xmask_8 : tensor<64x1x!tt.ptr<f32>> loc(#loc46)
|
| 114 |
+
tt.return loc(#loc47)
|
| 115 |
+
} loc(#loc)
|
| 116 |
+
tt.func private @"triton.language.standard.sum__fp32S64_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x8xf32> loc("input"(#loc48))) -> tensor<64xf32> attributes {noinline = false} {
|
| 117 |
+
%0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
|
| 118 |
+
^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
|
| 119 |
+
%2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc49)
|
| 120 |
+
tt.reduce.return %2 : f32 loc(#loc49)
|
| 121 |
+
}) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc49)
|
| 122 |
+
tt.return %0 : tensor<64xf32> loc(#loc51)
|
| 123 |
+
^bb1: // no predecessors
|
| 124 |
+
%1 = ub.poison : tensor<64xf32> loc(#loc52)
|
| 125 |
+
tt.return %1 : tensor<64xf32> loc(#loc52)
|
| 126 |
+
} loc(#loc48)
|
| 127 |
+
tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc53)), %b: f32 loc("b"(#loc53))) -> f32 attributes {noinline = false} {
|
| 128 |
+
%0 = arith.addf %a, %b : f32 loc(#loc54)
|
| 129 |
+
tt.return %0 : f32 loc(#loc55)
|
| 130 |
+
^bb1: // no predecessors
|
| 131 |
+
%1 = ub.poison : f32 loc(#loc56)
|
| 132 |
+
tt.return %1 : f32 loc(#loc56)
|
| 133 |
+
} loc(#loc53)
|
| 134 |
+
} loc(#loc)
|
| 135 |
+
#loc1 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":19:13)
|
| 136 |
+
#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":20:15)
|
| 137 |
+
#loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":23:28)
|
| 138 |
+
#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":23:33)
|
| 139 |
+
#loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":24:36)
|
| 140 |
+
#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":24:44)
|
| 141 |
+
#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":24:23)
|
| 142 |
+
#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":25:21)
|
| 143 |
+
#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":26:27)
|
| 144 |
+
#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":26:37)
|
| 145 |
+
#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":28:19)
|
| 146 |
+
#loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":29:19)
|
| 147 |
+
#loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":31:43)
|
| 148 |
+
#loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":32:40)
|
| 149 |
+
#loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":33:31)
|
| 150 |
+
#loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":34:29)
|
| 151 |
+
#loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":38:45)
|
| 152 |
+
#loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":38:41)
|
| 153 |
+
#loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":38:55)
|
| 154 |
+
#loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":38:50)
|
| 155 |
+
#loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":38:34)
|
| 156 |
+
#loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":38:70)
|
| 157 |
+
#loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":38:60)
|
| 158 |
+
#loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":38:122)
|
| 159 |
+
#loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":39:45)
|
| 160 |
+
#loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":39:41)
|
| 161 |
+
#loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":39:34)
|
| 162 |
+
#loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":39:60)
|
| 163 |
+
#loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":39:50)
|
| 164 |
+
#loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":39:112)
|
| 165 |
+
#loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":40:22)
|
| 166 |
+
#loc32 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":42:23)
|
| 167 |
+
#loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":43:35)
|
| 168 |
+
#loc34 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":43:48)
|
| 169 |
+
#loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":43:8)
|
| 170 |
+
#loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":44:25)
|
| 171 |
+
#loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":44:28)
|
| 172 |
+
#loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":45:30)
|
| 173 |
+
#loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":45:35)
|
| 174 |
+
#loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":47:11)
|
| 175 |
+
#loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":48:18)
|
| 176 |
+
#loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":49:12)
|
| 177 |
+
#loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":50:19)
|
| 178 |
+
#loc44 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":51:19)
|
| 179 |
+
#loc45 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":52:25)
|
| 180 |
+
#loc46 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":52:37)
|
| 181 |
+
#loc47 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":52:4)
|
| 182 |
+
#loc49 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
|
| 183 |
+
#loc51 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11)
|
| 184 |
+
#loc52 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4)
|
| 185 |
+
#loc54 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
|
| 186 |
+
#loc55 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11)
|
| 187 |
+
#loc56 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4)
|
| 188 |
+
#loc63 = loc("xnumel"(#loc1))
|
| 189 |
+
#loc64 = loc("r0_numel"(#loc2))
|
| 190 |
+
#loc65 = loc("xoffset"(#loc3))
|
| 191 |
+
#loc66 = loc("xoffset"(#loc4))
|
| 192 |
+
#loc67 = loc("xindex"(#loc5))
|
| 193 |
+
#loc68 = loc("xindex"(#loc6))
|
| 194 |
+
#loc69 = loc("xindex"(#loc7))
|
| 195 |
+
#loc70 = loc("xmask"(#loc8))
|
| 196 |
+
#loc71 = loc("r0_base"(#loc9))
|
| 197 |
+
#loc72 = loc("r0_base"(#loc10))
|
| 198 |
+
#loc73 = loc("x0"(#loc11))
|
| 199 |
+
#loc74 = loc("x1"(#loc12))
|
| 200 |
+
#loc75 = loc("_tmp4"(#loc13))
|
| 201 |
+
#loc76 = loc("_tmp4"(#loc14))
|
| 202 |
+
#loc77 = loc("r0_index"(#loc15))
|
| 203 |
+
#loc78 = loc("r0_mask"(#loc16))
|
| 204 |
+
#loc79 = loc("tmp0"(#loc17))
|
| 205 |
+
#loc80 = loc("tmp0"(#loc18))
|
| 206 |
+
#loc81 = loc("tmp0"(#loc19))
|
| 207 |
+
#loc82 = loc("tmp0"(#loc20))
|
| 208 |
+
#loc83 = loc("tmp0"(#loc21))
|
| 209 |
+
#loc84 = loc("tmp0"(#loc22))
|
| 210 |
+
#loc85 = loc("tmp0"(#loc23))
|
| 211 |
+
#loc86 = loc("tmp0"(#loc24))
|
| 212 |
+
#loc87 = loc("tmp1"(#loc25))
|
| 213 |
+
#loc88 = loc("tmp1"(#loc26))
|
| 214 |
+
#loc89 = loc("tmp1"(#loc27))
|
| 215 |
+
#loc90 = loc("tmp1"(#loc28))
|
| 216 |
+
#loc91 = loc("tmp1"(#loc29))
|
| 217 |
+
#loc92 = loc("tmp1"(#loc30))
|
| 218 |
+
#loc93 = loc("tmp2"(#loc31))
|
| 219 |
+
#loc94 = loc("tmp5"(#loc32))
|
| 220 |
+
#loc95 = loc("_tmp4"(#loc33))
|
| 221 |
+
#loc96 = loc("_tmp4"(#loc34))
|
| 222 |
+
#loc97 = loc("tmp4"(#loc36))
|
| 223 |
+
#loc98 = loc("tmp4"(#loc37))
|
| 224 |
+
#loc99 = loc("tmp7"(#loc38))
|
| 225 |
+
#loc100 = loc("tmp7"(#loc39))
|
| 226 |
+
#loc101 = loc("tmp8"(#loc40))
|
| 227 |
+
#loc102 = loc("tmp9"(#loc41))
|
| 228 |
+
#loc103 = loc("tmp10"(#loc42))
|
| 229 |
+
#loc104 = loc("tmp11"(#loc43))
|
| 230 |
+
#loc105 = loc("tmp12"(#loc44))
|
progress/SpecForge/cache/compiled_kernels/triton/2/55FURAY6LWNBQ2KK76SAMFCTAPMGJOZ6YFWFPHZ7UXAHP4Z4YGJA/triton_red_fused_mul_0.ttgir
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1]}>
|
| 2 |
+
#blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
|
| 3 |
+
#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":18:0)
|
| 4 |
+
#loc1 = loc(unknown)
|
| 5 |
+
#loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":44:25)
|
| 6 |
+
#loc42 = loc("in_ptr0"(#loc))
|
| 7 |
+
#loc43 = loc("in_ptr1"(#loc))
|
| 8 |
+
#loc44 = loc("in_ptr2"(#loc))
|
| 9 |
+
#loc45 = loc("out_ptr1"(#loc))
|
| 10 |
+
#loc46 = loc("xnumel"(#loc))
|
| 11 |
+
#loc47 = loc("r0_numel"(#loc))
|
| 12 |
+
#loc75 = loc("tmp4"(#loc31))
|
| 13 |
+
#loc83 = loc(callsite(#loc1 at #loc75))
|
| 14 |
+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
|
| 15 |
+
tt.func public @triton_red_fused_mul_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 16 |
+
%cst = arith.constant dense<0.693147182> : tensor<64x1xf32, #blocked> loc(#loc1)
|
| 17 |
+
%cst_0 = arith.constant dense<1.44269502> : tensor<64x1xf32, #blocked> loc(#loc1)
|
| 18 |
+
%cst_1 = arith.constant dense<54784> : tensor<64x1xi32, #blocked1> loc(#loc1)
|
| 19 |
+
%c64_i32 = arith.constant 64 : i32 loc(#loc1)
|
| 20 |
+
%cst_2 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked1> loc(#loc1)
|
| 21 |
+
%c8_i32 = arith.constant 8 : i32 loc(#loc1)
|
| 22 |
+
%c128_i32 = arith.constant 128 : i32 loc(#loc1)
|
| 23 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc1)
|
| 24 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked1> loc(#loc1)
|
| 25 |
+
%cst_4 = arith.constant dense<128> : tensor<1x8xi32, #blocked1> loc(#loc1)
|
| 26 |
+
%cst_5 = arith.constant dense<128> : tensor<64x1xi32, #blocked1> loc(#loc1)
|
| 27 |
+
%cst_6 = arith.constant dense<4096> : tensor<64x1xi32, #blocked1> loc(#loc1)
|
| 28 |
+
%cst_7 = arith.constant dense<1712> : tensor<64x1xi32, #blocked1> loc(#loc1)
|
| 29 |
+
%cst_8 = arith.constant dense<54784> : tensor<64x1xi32, #blocked> loc(#loc1)
|
| 30 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc48)
|
| 31 |
+
%xoffset_9 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc49)
|
| 32 |
+
%xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc50)
|
| 33 |
+
%xindex_10 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc50)
|
| 34 |
+
%xindex_11 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc50)
|
| 35 |
+
%xindex_12 = tt.expand_dims %xindex_10 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc50)
|
| 36 |
+
%xindex_13 = tt.splat %xoffset_9 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc51)
|
| 37 |
+
%xindex_14 = tt.splat %xoffset_9 : i32 -> tensor<64x1xi32, #blocked> loc(#loc51)
|
| 38 |
+
%xindex_15 = arith.addi %xindex_13, %xindex_11 : tensor<64x1xi32, #blocked1> loc(#loc51)
|
| 39 |
+
%xindex_16 = arith.addi %xindex_14, %xindex_12 : tensor<64x1xi32, #blocked> loc(#loc51)
|
| 40 |
+
%xmask = arith.cmpi slt, %xindex_15, %cst_1 : tensor<64x1xi32, #blocked1> loc(#loc52)
|
| 41 |
+
%xmask_17 = arith.cmpi slt, %xindex_16, %cst_8 : tensor<64x1xi32, #blocked> loc(#loc52)
|
| 42 |
+
%r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc53)
|
| 43 |
+
%r0_base_18 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x8xi32, #blocked1> loc(#loc53)
|
| 44 |
+
%x0 = arith.remsi %xindex_15, %cst_7 : tensor<64x1xi32, #blocked1> loc(#loc54)
|
| 45 |
+
%x1 = arith.divsi %xindex_15, %cst_7 : tensor<64x1xi32, #blocked1> loc(#loc55)
|
| 46 |
+
%tmp0 = arith.muli %x1, %cst_5 : tensor<64x1xi32, #blocked1> loc(#loc56)
|
| 47 |
+
%tmp0_19 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked1> -> tensor<64x8xi32, #blocked1> loc(#loc57)
|
| 48 |
+
%tmp0_20 = arith.muli %x0, %cst_6 : tensor<64x1xi32, #blocked1> loc(#loc58)
|
| 49 |
+
%tmp0_21 = tt.broadcast %tmp0_20 : tensor<64x1xi32, #blocked1> -> tensor<64x8xi32, #blocked1> loc(#loc59)
|
| 50 |
+
%tmp0_22 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>, #blocked1> loc(#loc60)
|
| 51 |
+
%tmp0_23 = tt.broadcast %xmask : tensor<64x1xi1, #blocked1> -> tensor<64x8xi1, #blocked1> loc(#loc61)
|
| 52 |
+
%tmp1 = arith.muli %xindex_15, %cst_5 : tensor<64x1xi32, #blocked1> loc(#loc62)
|
| 53 |
+
%tmp1_24 = tt.broadcast %tmp1 : tensor<64x1xi32, #blocked1> -> tensor<64x8xi32, #blocked1> loc(#loc63)
|
| 54 |
+
%tmp1_25 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>, #blocked1> loc(#loc64)
|
| 55 |
+
%_tmp4 = scf.for %_tmp4_30 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg7 = %cst_3) -> (tensor<64x8xf32, #blocked1>) : i32 {
|
| 56 |
+
%r0_index = tt.splat %_tmp4_30 : i32 -> tensor<1x8xi32, #blocked1> loc(#loc66)
|
| 57 |
+
%r0_index_31 = arith.addi %r0_index, %r0_base_18 : tensor<1x8xi32, #blocked1> loc(#loc66)
|
| 58 |
+
%r0_mask = arith.cmpi slt, %r0_index_31, %cst_4 : tensor<1x8xi32, #blocked1> loc(#loc67)
|
| 59 |
+
%tmp0_32 = tt.broadcast %r0_index_31 : tensor<1x8xi32, #blocked1> -> tensor<64x8xi32, #blocked1> loc(#loc57)
|
| 60 |
+
%tmp0_33 = arith.addi %tmp0_32, %tmp0_19 : tensor<64x8xi32, #blocked1> loc(#loc57)
|
| 61 |
+
%tmp0_34 = arith.addi %tmp0_33, %tmp0_21 : tensor<64x8xi32, #blocked1> loc(#loc59)
|
| 62 |
+
%tmp0_35 = tt.addptr %tmp0_22, %tmp0_34 : tensor<64x8x!tt.ptr<bf16>, #blocked1>, tensor<64x8xi32, #blocked1> loc(#loc60)
|
| 63 |
+
%tmp0_36 = tt.broadcast %r0_mask : tensor<1x8xi1, #blocked1> -> tensor<64x8xi1, #blocked1> loc(#loc61)
|
| 64 |
+
%tmp0_37 = arith.andi %tmp0_36, %tmp0_23 : tensor<64x8xi1, #blocked1> loc(#loc61)
|
| 65 |
+
%tmp0_38 = tt.load %tmp0_35, %tmp0_37, %cst_2 evictionPolicy = evict_first : tensor<64x8x!tt.ptr<bf16>, #blocked1> loc(#loc68)
|
| 66 |
+
%tmp0_39 = arith.extf %tmp0_38 : tensor<64x8xbf16, #blocked1> to tensor<64x8xf32, #blocked1> loc(#loc69)
|
| 67 |
+
%tmp1_40 = arith.addi %tmp0_32, %tmp1_24 : tensor<64x8xi32, #blocked1> loc(#loc63)
|
| 68 |
+
%tmp1_41 = tt.addptr %tmp1_25, %tmp1_40 : tensor<64x8x!tt.ptr<bf16>, #blocked1>, tensor<64x8xi32, #blocked1> loc(#loc64)
|
| 69 |
+
%tmp1_42 = tt.load %tmp1_41, %tmp0_37, %cst_2 evictionPolicy = evict_first : tensor<64x8x!tt.ptr<bf16>, #blocked1> loc(#loc70)
|
| 70 |
+
%tmp1_43 = arith.extf %tmp1_42 : tensor<64x8xbf16, #blocked1> to tensor<64x8xf32, #blocked1> loc(#loc71)
|
| 71 |
+
%tmp2 = arith.mulf %tmp0_39, %tmp1_43 : tensor<64x8xf32, #blocked1> loc(#loc72)
|
| 72 |
+
%tmp5 = arith.addf %arg7, %tmp2 : tensor<64x8xf32, #blocked1> loc(#loc73)
|
| 73 |
+
%_tmp4_44 = arith.select %tmp0_37, %tmp5, %arg7 : tensor<64x8xi1, #blocked1>, tensor<64x8xf32, #blocked1> loc(#loc74)
|
| 74 |
+
scf.yield %_tmp4_44 : tensor<64x8xf32, #blocked1> loc(#loc29)
|
| 75 |
+
} loc(#loc65)
|
| 76 |
+
%tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
|
| 77 |
+
^bb0(%tmp4_30: f32 loc(callsite(#loc1 at #loc75)), %tmp4_31: f32 loc(callsite(#loc1 at #loc75))):
|
| 78 |
+
%tmp4_32 = arith.addf %tmp4_30, %tmp4_31 : f32 loc(#loc84)
|
| 79 |
+
tt.reduce.return %tmp4_32 : f32 loc(#loc82)
|
| 80 |
+
}) : (tensor<64x8xf32, #blocked1>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc82)
|
| 81 |
+
%tmp12 = ttg.convert_layout %tmp4 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc76)
|
| 82 |
+
%tmp4_26 = tt.expand_dims %tmp12 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xf32, #blocked> loc(#loc77)
|
| 83 |
+
%tmp7 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>, #blocked> loc(#loc78)
|
| 84 |
+
%tmp7_27 = tt.addptr %tmp7, %xindex_16 : tensor<64x1x!tt.ptr<f32>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc78)
|
| 85 |
+
%tmp7_28 = tt.load %tmp7_27, %xmask_17 evictionPolicy = evict_last : tensor<64x1x!tt.ptr<f32>, #blocked> loc(#loc79)
|
| 86 |
+
%tmp9 = arith.mulf %tmp7_28, %cst : tensor<64x1xf32, #blocked> loc(#loc80)
|
| 87 |
+
%tmp11 = arith.mulf %tmp9, %cst_0 : tensor<64x1xf32, #blocked> loc(#loc81)
|
| 88 |
+
%tmp12_29 = arith.subf %tmp4_26, %tmp11 : tensor<64x1xf32, #blocked> loc(#loc76)
|
| 89 |
+
%0 = tt.splat %out_ptr1 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>, #blocked> loc(#loc39)
|
| 90 |
+
%1 = tt.addptr %0, %xindex_16 : tensor<64x1x!tt.ptr<f32>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc39)
|
| 91 |
+
tt.store %1, %tmp12_29, %xmask_17 : tensor<64x1x!tt.ptr<f32>, #blocked> loc(#loc40)
|
| 92 |
+
tt.return loc(#loc41)
|
| 93 |
+
} loc(#loc)
|
| 94 |
+
} loc(#loc)
|
| 95 |
+
#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":23:28)
|
| 96 |
+
#loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":23:33)
|
| 97 |
+
#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":24:44)
|
| 98 |
+
#loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":24:23)
|
| 99 |
+
#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":25:21)
|
| 100 |
+
#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":26:37)
|
| 101 |
+
#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":28:19)
|
| 102 |
+
#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":29:19)
|
| 103 |
+
#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":38:45)
|
| 104 |
+
#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":38:41)
|
| 105 |
+
#loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":38:55)
|
| 106 |
+
#loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":38:50)
|
| 107 |
+
#loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":38:34)
|
| 108 |
+
#loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":38:70)
|
| 109 |
+
#loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":39:45)
|
| 110 |
+
#loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":39:41)
|
| 111 |
+
#loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":39:34)
|
| 112 |
+
#loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":32:40)
|
| 113 |
+
#loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":33:31)
|
| 114 |
+
#loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":34:29)
|
| 115 |
+
#loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":38:60)
|
| 116 |
+
#loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":38:122)
|
| 117 |
+
#loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":39:50)
|
| 118 |
+
#loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":39:112)
|
| 119 |
+
#loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":40:22)
|
| 120 |
+
#loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":42:23)
|
| 121 |
+
#loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":43:48)
|
| 122 |
+
#loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":43:8)
|
| 123 |
+
#loc30 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
|
| 124 |
+
#loc32 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
|
| 125 |
+
#loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":51:19)
|
| 126 |
+
#loc34 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":44:28)
|
| 127 |
+
#loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":45:30)
|
| 128 |
+
#loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":45:35)
|
| 129 |
+
#loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":48:18)
|
| 130 |
+
#loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":50:19)
|
| 131 |
+
#loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":52:25)
|
| 132 |
+
#loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":52:37)
|
| 133 |
+
#loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":52:4)
|
| 134 |
+
#loc48 = loc("xoffset"(#loc2))
|
| 135 |
+
#loc49 = loc("xoffset"(#loc3))
|
| 136 |
+
#loc50 = loc("xindex"(#loc4))
|
| 137 |
+
#loc51 = loc("xindex"(#loc5))
|
| 138 |
+
#loc52 = loc("xmask"(#loc6))
|
| 139 |
+
#loc53 = loc("r0_base"(#loc7))
|
| 140 |
+
#loc54 = loc("x0"(#loc8))
|
| 141 |
+
#loc55 = loc("x1"(#loc9))
|
| 142 |
+
#loc56 = loc("tmp0"(#loc10))
|
| 143 |
+
#loc57 = loc("tmp0"(#loc11))
|
| 144 |
+
#loc58 = loc("tmp0"(#loc12))
|
| 145 |
+
#loc59 = loc("tmp0"(#loc13))
|
| 146 |
+
#loc60 = loc("tmp0"(#loc14))
|
| 147 |
+
#loc61 = loc("tmp0"(#loc15))
|
| 148 |
+
#loc62 = loc("tmp1"(#loc16))
|
| 149 |
+
#loc63 = loc("tmp1"(#loc17))
|
| 150 |
+
#loc64 = loc("tmp1"(#loc18))
|
| 151 |
+
#loc65 = loc("_tmp4"(#loc19))
|
| 152 |
+
#loc66 = loc("r0_index"(#loc20))
|
| 153 |
+
#loc67 = loc("r0_mask"(#loc21))
|
| 154 |
+
#loc68 = loc("tmp0"(#loc22))
|
| 155 |
+
#loc69 = loc("tmp0"(#loc23))
|
| 156 |
+
#loc70 = loc("tmp1"(#loc24))
|
| 157 |
+
#loc71 = loc("tmp1"(#loc25))
|
| 158 |
+
#loc72 = loc("tmp2"(#loc26))
|
| 159 |
+
#loc73 = loc("tmp5"(#loc27))
|
| 160 |
+
#loc74 = loc("_tmp4"(#loc28))
|
| 161 |
+
#loc76 = loc("tmp12"(#loc33))
|
| 162 |
+
#loc77 = loc("tmp4"(#loc34))
|
| 163 |
+
#loc78 = loc("tmp7"(#loc35))
|
| 164 |
+
#loc79 = loc("tmp7"(#loc36))
|
| 165 |
+
#loc80 = loc("tmp9"(#loc37))
|
| 166 |
+
#loc81 = loc("tmp11"(#loc38))
|
| 167 |
+
#loc82 = loc(callsite(#loc30 at #loc75))
|
| 168 |
+
#loc84 = loc(callsite(#loc32 at #loc82))
|
progress/SpecForge/cache/compiled_kernels/triton/2/55FURAY6LWNBQ2KK76SAMFCTAPMGJOZ6YFWFPHZ7UXAHP4Z4YGJA/triton_red_fused_mul_0.ttir
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":18:0)
|
| 2 |
+
#loc1 = loc(unknown)
|
| 3 |
+
#loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":44:25)
|
| 4 |
+
#loc44 = loc("in_ptr0"(#loc))
|
| 5 |
+
#loc45 = loc("in_ptr1"(#loc))
|
| 6 |
+
#loc46 = loc("in_ptr2"(#loc))
|
| 7 |
+
#loc47 = loc("out_ptr1"(#loc))
|
| 8 |
+
#loc48 = loc("xnumel"(#loc))
|
| 9 |
+
#loc49 = loc("r0_numel"(#loc))
|
| 10 |
+
#loc81 = loc("tmp4"(#loc35))
|
| 11 |
+
#loc87 = loc(callsite(#loc1 at #loc81))
|
| 12 |
+
module {
|
| 13 |
+
tt.func public @triton_red_fused_mul_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
|
| 14 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<64x8xbf16> loc(#loc1)
|
| 15 |
+
%c8_i32 = arith.constant 8 : i32 loc(#loc2)
|
| 16 |
+
%c128_i32 = arith.constant 128 : i32 loc(#loc2)
|
| 17 |
+
%c0_i32 = arith.constant 0 : i32 loc(#loc2)
|
| 18 |
+
%tmp11 = arith.constant dense<1.44269502> : tensor<64x1xf32> loc(#loc50)
|
| 19 |
+
%tmp9 = arith.constant dense<0.693147182> : tensor<64x1xf32> loc(#loc51)
|
| 20 |
+
%cst_0 = arith.constant dense<4096> : tensor<64x1xi32> loc(#loc1)
|
| 21 |
+
%cst_1 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1)
|
| 22 |
+
%cst_2 = arith.constant dense<128> : tensor<1x8xi32> loc(#loc1)
|
| 23 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc1)
|
| 24 |
+
%cst_4 = arith.constant dense<1712> : tensor<64x1xi32> loc(#loc1)
|
| 25 |
+
%xmask = arith.constant dense<54784> : tensor<64x1xi32> loc(#loc52)
|
| 26 |
+
%c64_i32 = arith.constant 64 : i32 loc(#loc1)
|
| 27 |
+
%xoffset = tt.get_program_id x : i32 loc(#loc53)
|
| 28 |
+
%xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc54)
|
| 29 |
+
%xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc55)
|
| 30 |
+
%xindex_6 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc56)
|
| 31 |
+
%xindex_7 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32> loc(#loc57)
|
| 32 |
+
%xindex_8 = arith.addi %xindex_7, %xindex_6 : tensor<64x1xi32> loc(#loc57)
|
| 33 |
+
%xmask_9 = arith.cmpi slt, %xindex_8, %xmask : tensor<64x1xi32> loc(#loc52)
|
| 34 |
+
%r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc58)
|
| 35 |
+
%r0_base_10 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32> -> tensor<1x8xi32> loc(#loc59)
|
| 36 |
+
%x0 = arith.remsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc60)
|
| 37 |
+
%x1 = arith.divsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc61)
|
| 38 |
+
%_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%_tmp4_16 = %cst_3) -> (tensor<64x8xf32>) : i32 {
|
| 39 |
+
%r0_index = tt.splat %r0_offset : i32 -> tensor<1x8xi32> loc(#loc63)
|
| 40 |
+
%r0_index_17 = arith.addi %r0_index, %r0_base_10 : tensor<1x8xi32> loc(#loc63)
|
| 41 |
+
%r0_mask = arith.cmpi slt, %r0_index_17, %cst_2 : tensor<1x8xi32> loc(#loc64)
|
| 42 |
+
%tmp0 = arith.muli %x1, %cst_1 : tensor<64x1xi32> loc(#loc65)
|
| 43 |
+
%tmp0_18 = tt.broadcast %r0_index_17 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc66)
|
| 44 |
+
%tmp0_19 = tt.broadcast %tmp0 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc66)
|
| 45 |
+
%tmp0_20 = arith.addi %tmp0_18, %tmp0_19 : tensor<64x8xi32> loc(#loc66)
|
| 46 |
+
%tmp0_21 = arith.muli %x0, %cst_0 : tensor<64x1xi32> loc(#loc67)
|
| 47 |
+
%tmp0_22 = tt.broadcast %tmp0_21 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc68)
|
| 48 |
+
%tmp0_23 = arith.addi %tmp0_20, %tmp0_22 : tensor<64x8xi32> loc(#loc68)
|
| 49 |
+
%tmp0_24 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc69)
|
| 50 |
+
%tmp0_25 = tt.addptr %tmp0_24, %tmp0_23 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc69)
|
| 51 |
+
%tmp0_26 = tt.broadcast %r0_mask : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc70)
|
| 52 |
+
%tmp0_27 = tt.broadcast %xmask_9 : tensor<64x1xi1> -> tensor<64x8xi1> loc(#loc70)
|
| 53 |
+
%tmp0_28 = arith.andi %tmp0_26, %tmp0_27 : tensor<64x8xi1> loc(#loc70)
|
| 54 |
+
%tmp0_29 = tt.load %tmp0_25, %tmp0_28, %cst evictionPolicy = evict_first : tensor<64x8x!tt.ptr<bf16>> loc(#loc71)
|
| 55 |
+
%tmp0_30 = arith.extf %tmp0_29 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc72)
|
| 56 |
+
%tmp1 = arith.muli %xindex_8, %cst_1 : tensor<64x1xi32> loc(#loc73)
|
| 57 |
+
%tmp1_31 = tt.broadcast %tmp1 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc74)
|
| 58 |
+
%tmp1_32 = arith.addi %tmp0_18, %tmp1_31 : tensor<64x8xi32> loc(#loc74)
|
| 59 |
+
%tmp1_33 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc75)
|
| 60 |
+
%tmp1_34 = tt.addptr %tmp1_33, %tmp1_32 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc75)
|
| 61 |
+
%tmp1_35 = tt.load %tmp1_34, %tmp0_28, %cst evictionPolicy = evict_first : tensor<64x8x!tt.ptr<bf16>> loc(#loc76)
|
| 62 |
+
%tmp1_36 = arith.extf %tmp1_35 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc77)
|
| 63 |
+
%tmp2 = arith.mulf %tmp0_30, %tmp1_36 : tensor<64x8xf32> loc(#loc78)
|
| 64 |
+
%tmp5 = arith.addf %_tmp4_16, %tmp2 : tensor<64x8xf32> loc(#loc79)
|
| 65 |
+
%_tmp4_37 = arith.select %tmp0_28, %tmp5, %_tmp4_16 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc80)
|
| 66 |
+
scf.yield %_tmp4_37 : tensor<64x8xf32> loc(#loc33)
|
| 67 |
+
} loc(#loc62)
|
| 68 |
+
%tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
|
| 69 |
+
^bb0(%tmp4_16: f32 loc(callsite(#loc1 at #loc81)), %tmp4_17: f32 loc(callsite(#loc1 at #loc81))):
|
| 70 |
+
%tmp4_18 = arith.addf %tmp4_16, %tmp4_17 : f32 loc(#loc88)
|
| 71 |
+
tt.reduce.return %tmp4_18 : f32 loc(#loc86)
|
| 72 |
+
}) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc86)
|
| 73 |
+
%tmp4_11 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc82)
|
| 74 |
+
%tmp7 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc83)
|
| 75 |
+
%tmp7_12 = tt.addptr %tmp7, %xindex_8 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc83)
|
| 76 |
+
%tmp7_13 = tt.load %tmp7_12, %xmask_9 evictionPolicy = evict_last : tensor<64x1x!tt.ptr<f32>> loc(#loc84)
|
| 77 |
+
%tmp9_14 = arith.mulf %tmp7_13, %tmp9 : tensor<64x1xf32> loc(#loc51)
|
| 78 |
+
%tmp11_15 = arith.mulf %tmp9_14, %tmp11 : tensor<64x1xf32> loc(#loc50)
|
| 79 |
+
%tmp12 = arith.subf %tmp4_11, %tmp11_15 : tensor<64x1xf32> loc(#loc85)
|
| 80 |
+
%0 = tt.splat %out_ptr1 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc41)
|
| 81 |
+
%1 = tt.addptr %0, %xindex_8 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc41)
|
| 82 |
+
tt.store %1, %tmp12, %xmask_9 : tensor<64x1x!tt.ptr<f32>> loc(#loc42)
|
| 83 |
+
tt.return loc(#loc43)
|
| 84 |
+
} loc(#loc)
|
| 85 |
+
} loc(#loc)
|
| 86 |
+
#loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":32:40)
|
| 87 |
+
#loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":50:19)
|
| 88 |
+
#loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":48:18)
|
| 89 |
+
#loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":25:21)
|
| 90 |
+
#loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":23:28)
|
| 91 |
+
#loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":23:33)
|
| 92 |
+
#loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":24:36)
|
| 93 |
+
#loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":24:44)
|
| 94 |
+
#loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":24:23)
|
| 95 |
+
#loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":26:27)
|
| 96 |
+
#loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":26:37)
|
| 97 |
+
#loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":28:19)
|
| 98 |
+
#loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":29:19)
|
| 99 |
+
#loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":33:31)
|
| 100 |
+
#loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":34:29)
|
| 101 |
+
#loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":38:45)
|
| 102 |
+
#loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":38:41)
|
| 103 |
+
#loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":38:55)
|
| 104 |
+
#loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":38:50)
|
| 105 |
+
#loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":38:34)
|
| 106 |
+
#loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":38:70)
|
| 107 |
+
#loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":38:60)
|
| 108 |
+
#loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":38:122)
|
| 109 |
+
#loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":39:45)
|
| 110 |
+
#loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":39:41)
|
| 111 |
+
#loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":39:34)
|
| 112 |
+
#loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":39:50)
|
| 113 |
+
#loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":39:112)
|
| 114 |
+
#loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":40:22)
|
| 115 |
+
#loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":42:23)
|
| 116 |
+
#loc32 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":43:48)
|
| 117 |
+
#loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":43:8)
|
| 118 |
+
#loc34 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
|
| 119 |
+
#loc36 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
|
| 120 |
+
#loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":44:28)
|
| 121 |
+
#loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":45:30)
|
| 122 |
+
#loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":45:35)
|
| 123 |
+
#loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":51:19)
|
| 124 |
+
#loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":52:25)
|
| 125 |
+
#loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":52:37)
|
| 126 |
+
#loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/t3/ct3idxec76qtl3vshvvwnyu6rebkbok6z5sogtvlz6a62sxeduqp.py":52:4)
|
| 127 |
+
#loc50 = loc("tmp11"(#loc3))
|
| 128 |
+
#loc51 = loc("tmp9"(#loc4))
|
| 129 |
+
#loc52 = loc("xmask"(#loc5))
|
| 130 |
+
#loc53 = loc("xoffset"(#loc6))
|
| 131 |
+
#loc54 = loc("xoffset"(#loc7))
|
| 132 |
+
#loc55 = loc("xindex"(#loc8))
|
| 133 |
+
#loc56 = loc("xindex"(#loc9))
|
| 134 |
+
#loc57 = loc("xindex"(#loc10))
|
| 135 |
+
#loc58 = loc("r0_base"(#loc11))
|
| 136 |
+
#loc59 = loc("r0_base"(#loc12))
|
| 137 |
+
#loc60 = loc("x0"(#loc13))
|
| 138 |
+
#loc61 = loc("x1"(#loc14))
|
| 139 |
+
#loc62 = loc("_tmp4"(#loc2))
|
| 140 |
+
#loc63 = loc("r0_index"(#loc15))
|
| 141 |
+
#loc64 = loc("r0_mask"(#loc16))
|
| 142 |
+
#loc65 = loc("tmp0"(#loc17))
|
| 143 |
+
#loc66 = loc("tmp0"(#loc18))
|
| 144 |
+
#loc67 = loc("tmp0"(#loc19))
|
| 145 |
+
#loc68 = loc("tmp0"(#loc20))
|
| 146 |
+
#loc69 = loc("tmp0"(#loc21))
|
| 147 |
+
#loc70 = loc("tmp0"(#loc22))
|
| 148 |
+
#loc71 = loc("tmp0"(#loc23))
|
| 149 |
+
#loc72 = loc("tmp0"(#loc24))
|
| 150 |
+
#loc73 = loc("tmp1"(#loc25))
|
| 151 |
+
#loc74 = loc("tmp1"(#loc26))
|
| 152 |
+
#loc75 = loc("tmp1"(#loc27))
|
| 153 |
+
#loc76 = loc("tmp1"(#loc28))
|
| 154 |
+
#loc77 = loc("tmp1"(#loc29))
|
| 155 |
+
#loc78 = loc("tmp2"(#loc30))
|
| 156 |
+
#loc79 = loc("tmp5"(#loc31))
|
| 157 |
+
#loc80 = loc("_tmp4"(#loc32))
|
| 158 |
+
#loc82 = loc("tmp4"(#loc37))
|
| 159 |
+
#loc83 = loc("tmp7"(#loc38))
|
| 160 |
+
#loc84 = loc("tmp7"(#loc39))
|
| 161 |
+
#loc85 = loc("tmp12"(#loc40))
|
| 162 |
+
#loc86 = loc(callsite(#loc34 at #loc81))
|
| 163 |
+
#loc88 = loc(callsite(#loc36 at #loc86))
|
progress/SpecForge/cache/compiled_kernels/triton/2/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/__grp__triton_tem_fused_0.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"child_paths": {"triton_tem_fused_0.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.source", "triton_tem_fused_0.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ttir", "triton_tem_fused_0.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ttgir", "triton_tem_fused_0.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.llir", "triton_tem_fused_0.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ptx", "triton_tem_fused_0.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.cubin", "triton_tem_fused_0.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/2/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.json"}}
|
progress/SpecForge/cache/compiled_kernels/triton/2/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"hash": "ef65677dccb0fd1ca33e2efd85dd27b554735f6893116cae461084f5b56323fe", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 196608, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_tem_fused_0"}
|
progress/SpecForge/cache/compiled_kernels/triton/2/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.llir
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|