Lekr0 commited on
Commit
c13c5e7
·
verified ·
1 Parent(s): e498747

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. progress/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/__grp__triton_tem_fused_0.json +1 -0
  2. progress/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.json +1 -0
  3. progress/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.llir +0 -0
  4. progress/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ptx +0 -0
  5. progress/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.source +0 -0
  6. progress/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ttgir +936 -0
  7. progress/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ttir +780 -0
  8. progress/SpecForge/cache/compiled_kernels/triton/3/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/__grp__triton_per_fused_mul_1.json +1 -0
  9. progress/SpecForge/cache/compiled_kernels/triton/3/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.cubin +0 -0
  10. progress/SpecForge/cache/compiled_kernels/triton/3/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.json +1 -0
  11. progress/SpecForge/cache/compiled_kernels/triton/3/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.llir +1190 -0
  12. progress/SpecForge/cache/compiled_kernels/triton/3/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.ptx +1141 -0
  13. progress/SpecForge/cache/compiled_kernels/triton/3/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.source +391 -0
  14. progress/SpecForge/cache/compiled_kernels/triton/3/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.ttgir +239 -0
  15. progress/SpecForge/cache/compiled_kernels/triton/3/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.ttir +229 -0
  16. progress/SpecForge/cache/compiled_kernels/triton/3/2GPX4H2RCLQEV2LIJWPK5V4BGHUBOBJI3FUMPIZKQJ7CMB34OTAA/__grp__triton_per_fused_mul_1.json +1 -0
  17. progress/SpecForge/cache/compiled_kernels/triton/3/2GPX4H2RCLQEV2LIJWPK5V4BGHUBOBJI3FUMPIZKQJ7CMB34OTAA/triton_per_fused_mul_1.cubin +0 -0
  18. progress/SpecForge/cache/compiled_kernels/triton/3/2GPX4H2RCLQEV2LIJWPK5V4BGHUBOBJI3FUMPIZKQJ7CMB34OTAA/triton_per_fused_mul_1.json +1 -0
  19. progress/SpecForge/cache/compiled_kernels/triton/3/2GPX4H2RCLQEV2LIJWPK5V4BGHUBOBJI3FUMPIZKQJ7CMB34OTAA/triton_per_fused_mul_1.llir +1153 -0
  20. progress/SpecForge/cache/compiled_kernels/triton/3/2GPX4H2RCLQEV2LIJWPK5V4BGHUBOBJI3FUMPIZKQJ7CMB34OTAA/triton_per_fused_mul_1.ptx +1060 -0
  21. progress/SpecForge/cache/compiled_kernels/triton/3/2GPX4H2RCLQEV2LIJWPK5V4BGHUBOBJI3FUMPIZKQJ7CMB34OTAA/triton_per_fused_mul_1.source +302 -0
  22. progress/SpecForge/cache/compiled_kernels/triton/3/2GPX4H2RCLQEV2LIJWPK5V4BGHUBOBJI3FUMPIZKQJ7CMB34OTAA/triton_per_fused_mul_1.ttgir +174 -0
  23. progress/SpecForge/cache/compiled_kernels/triton/3/2GPX4H2RCLQEV2LIJWPK5V4BGHUBOBJI3FUMPIZKQJ7CMB34OTAA/triton_per_fused_mul_1.ttir +165 -0
  24. progress/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/__grp__triton_poi_fused_mul_1.json +1 -0
  25. progress/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.cubin +0 -0
  26. progress/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.json +1 -0
  27. progress/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.llir +89 -0
  28. progress/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ptx +357 -0
  29. progress/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.source +130 -0
  30. progress/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ttgir +105 -0
  31. progress/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ttir +104 -0
  32. progress/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/__grp__triton_tem_fused_0.json +1 -0
  33. progress/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.json +1 -0
  34. progress/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.llir +0 -0
  35. progress/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ptx +0 -0
  36. progress/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.source +0 -0
  37. progress/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ttgir +0 -0
  38. progress/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ttir +896 -0
  39. progress/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/__grp__triton_tem_fused_mul_1.json +1 -0
  40. progress/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.json +1 -0
  41. progress/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.llir +0 -0
  42. progress/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.ptx +0 -0
  43. progress/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.source +0 -0
  44. progress/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.ttgir +0 -0
  45. progress/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.ttir +0 -0
  46. progress/SpecForge/cache/compiled_kernels/triton/3/5GHJ3QO6IQW65MAR65YJDHQXMEEKX2KGGUQWS5XUVWXZI7RAC2YA/__grp__triton_per_fused_2.json +1 -0
  47. progress/SpecForge/cache/compiled_kernels/triton/3/5GHJ3QO6IQW65MAR65YJDHQXMEEKX2KGGUQWS5XUVWXZI7RAC2YA/triton_per_fused_2.cubin +0 -0
  48. progress/SpecForge/cache/compiled_kernels/triton/3/5GHJ3QO6IQW65MAR65YJDHQXMEEKX2KGGUQWS5XUVWXZI7RAC2YA/triton_per_fused_2.json +1 -0
  49. progress/SpecForge/cache/compiled_kernels/triton/3/5GHJ3QO6IQW65MAR65YJDHQXMEEKX2KGGUQWS5XUVWXZI7RAC2YA/triton_per_fused_2.llir +167 -0
  50. progress/SpecForge/cache/compiled_kernels/triton/3/5GHJ3QO6IQW65MAR65YJDHQXMEEKX2KGGUQWS5XUVWXZI7RAC2YA/triton_per_fused_2.ptx +389 -0
progress/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/__grp__triton_tem_fused_0.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_tem_fused_0.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.source", "triton_tem_fused_0.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ttir", "triton_tem_fused_0.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ttgir", "triton_tem_fused_0.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.llir", "triton_tem_fused_0.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ptx", "triton_tem_fused_0.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.cubin", "triton_tem_fused_0.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.json"}}
progress/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "d6d8579bc758efaefba73018088fcd8f749b5f25c3ab12d02ac240e2311ac409", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 131072, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_tem_fused_0"}
progress/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.llir ADDED
The diff for this file is too large to render. See raw diff
 
progress/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ptx ADDED
The diff for this file is too large to render. See raw diff
 
progress/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.source ADDED
The diff for this file is too large to render. See raw diff
 
progress/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ttgir ADDED
@@ -0,0 +1,936 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}>
2
+ #blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
3
+ #loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":18:0)
4
+ #loc1 = loc(unknown)
5
+ #loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":520:16)
6
+ #loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":172:41)
7
+ #loc83 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":403:51)
8
+ #loc95 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":416:34)
9
+ #loc131 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":198:45)
10
+ #mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 64, 16]}>
11
+ #mma1 = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 128, 16]}>
12
+ #shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
13
+ #shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 16}>
14
+ #smem = #ttg.shared_memory
15
+ #loc152 = loc("arg_Q"(#loc))
16
+ #loc153 = loc("arg_K"(#loc))
17
+ #loc154 = loc("arg_V"(#loc))
18
+ #loc155 = loc("arg_LSE"(#loc))
19
+ #loc156 = loc("arg_MAX"(#loc))
20
+ #loc157 = loc("arg_KV_NUM_BLKS"(#loc))
21
+ #loc158 = loc("arg_KV_IDX"(#loc))
22
+ #loc159 = loc("arg_FULL_KV_NUM_BLKS"(#loc))
23
+ #loc160 = loc("arg_FULL_KV_IDX"(#loc))
24
+ #loc161 = loc("out_ptr0"(#loc))
25
+ #loc162 = loc("ks0"(#loc))
26
+ #loc163 = loc("ks1"(#loc))
27
+ #loc164 = loc("ks2"(#loc))
28
+ #loc165 = loc("ks3"(#loc))
29
+ #loc166 = loc("ks4"(#loc))
30
+ #loc200 = loc(callsite(#loc41 at #loc42))
31
+ #loc239 = loc("m_ij"(#loc83))
32
+ #loc249 = loc("l_i"(#loc95))
33
+ #loc283 = loc(callsite(#loc41 at #loc131))
34
+ #loc345 = loc(callsite(#loc239 at #loc200))
35
+ #loc355 = loc(callsite(#loc249 at #loc200))
36
+ #loc374 = loc(callsite(#loc239 at #loc283))
37
+ #loc384 = loc(callsite(#loc249 at #loc283))
38
+ #loc406 = loc(callsite(#loc1 at #loc345))
39
+ #loc408 = loc(callsite(#loc1 at #loc355))
40
+ #loc436 = loc(callsite(#loc1 at #loc374))
41
+ #loc438 = loc(callsite(#loc1 at #loc384))
42
+ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
43
+ tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_MAX: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("arg_MAX"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc)), %ks2: i32 loc("ks2"(#loc)), %ks3: i32 loc("ks3"(#loc)), %ks4: i32 loc("ks4"(#loc))) attributes {noinline = false} {
44
+ %cst = arith.constant dense<0> : tensor<1x64xi32, #mma> loc(#loc1)
45
+ %cst_0 = arith.constant dense<0> : tensor<128x1xi32, #mma> loc(#loc1)
46
+ %cst_1 = arith.constant dense<1> : tensor<128x1xi32, #mma> loc(#loc1)
47
+ %cst_2 = arith.constant dense<1> : tensor<1x64xi32, #mma> loc(#loc1)
48
+ %cst_3 = arith.constant dense<false> : tensor<128x64xi1, #mma> loc(#loc1)
49
+ %cst_4 = arith.constant dense<16> : tensor<1x64xi32, #mma> loc(#loc1)
50
+ %cst_5 = arith.constant dense<16> : tensor<128x1xi32, #mma> loc(#loc1)
51
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> loc(#loc1)
52
+ %cst_7 = arith.constant dense<1024> : tensor<64x1xi32, #blocked> loc(#loc1)
53
+ %cst_8 = arith.constant dense<4096> : tensor<128x1xi32, #blocked> loc(#loc1)
54
+ %cst_9 = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1)
55
+ %c2_i32 = arith.constant 2 : i32 loc(#loc1)
56
+ %c4_i32 = arith.constant 4 : i32 loc(#loc1)
57
+ %c32_i32 = arith.constant 32 : i32 loc(#loc1)
58
+ %c1_i32 = arith.constant 1 : i32 loc(#loc1)
59
+ %c128_i32 = arith.constant 128 : i32 loc(#loc1)
60
+ %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
61
+ %cst_10 = arith.constant dense<0.000000e+00> : tensor<128x128xbf16, #blocked> loc(#loc1)
62
+ %cst_11 = arith.constant dense<0.000000e+00> : tensor<64x128xbf16, #blocked> loc(#loc1)
63
+ %c64_i32 = arith.constant 64 : i32 loc(#loc1)
64
+ %c63_i32 = arith.constant 63 : i32 loc(#loc1)
65
+ %c0_i32 = arith.constant 0 : i32 loc(#loc1)
66
+ %cst_12 = arith.constant dense<0.000000e+00> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1)
67
+ %cst_13 = arith.constant dense<1.000000e+00> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1)
68
+ %cst_14 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma1> loc(#loc1)
69
+ %cst_15 = arith.constant dense<0.0883883461> : tensor<128x64xf32, #mma> loc(#loc1)
70
+ %cst_16 = arith.constant dense<0xFF800000> : tensor<128x64xf32, #mma> loc(#loc1)
71
+ %cst_17 = arith.constant dense<1.44269502> : tensor<128x64xf32, #mma> loc(#loc1)
72
+ %cst_18 = arith.constant dense<0xFF800000> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1)
73
+ %c-1_i32 = arith.constant -1 : i32 loc(#loc1)
74
+ %c3_i32 = arith.constant 3 : i32 loc(#loc1)
75
+ %0 = arith.muli %ks0, %c4096_i32 : i32 loc(#loc2)
76
+ %q_start = tt.get_program_id x : i32 loc(#loc167)
77
+ %off_zq = tt.get_program_id y : i32 loc(#loc168)
78
+ %off_hq = tt.get_program_id z : i32 loc(#loc169)
79
+ %off_hkv = arith.divsi %off_hq, %c4_i32 : i32 loc(#loc170)
80
+ %q_offset = arith.muli %off_zq, %0 : i32 loc(#loc171)
81
+ %q_offset_19 = arith.muli %off_hq, %c128_i32 : i32 loc(#loc172)
82
+ %q_offset_20 = arith.addi %q_offset, %q_offset_19 : i32 loc(#loc173)
83
+ %k_offset = arith.muli %off_hkv, %c128_i32 : i32 loc(#loc174)
84
+ %Q = tt.addptr %arg_Q, %q_offset_20 : !tt.ptr<bf16>, i32 loc(#loc175)
85
+ %K = tt.addptr %arg_K, %k_offset : !tt.ptr<bf16>, i32 loc(#loc176)
86
+ %V = tt.addptr %arg_V, %k_offset : !tt.ptr<bf16>, i32 loc(#loc177)
87
+ %sparse_kv_idx_offset = arith.muli %q_start, %ks4 : i32 loc(#loc178)
88
+ %offs_m = arith.muli %q_start, %c128_i32 : i32 loc(#loc179)
89
+ %offs_m_21 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc180)
90
+ %offs_m_22 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc180)
91
+ %offs_m_23 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked1> loc(#loc180)
92
+ %offs_m_24 = tt.splat %offs_m : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc181)
93
+ %offs_m_25 = tt.splat %offs_m : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc181)
94
+ %offs_m_26 = tt.splat %offs_m : i32 -> tensor<128xi32, #blocked1> loc(#loc181)
95
+ %offs_m_27 = arith.addi %offs_m_24, %offs_m_21 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc181)
96
+ %offs_m_28 = arith.addi %offs_m_25, %offs_m_22 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc181)
97
+ %offs_m_29 = arith.addi %offs_m_26, %offs_m_23 : tensor<128xi32, #blocked1> loc(#loc181)
98
+ %ptr = tt.expand_dims %offs_m_27 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc297)
99
+ %ptr_30 = tt.expand_dims %offs_m_28 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xi32, #mma> loc(#loc297)
100
+ %ptr_31 = arith.muli %ptr, %cst_8 : tensor<128x1xi32, #blocked> loc(#loc298)
101
+ %ptr_32 = tt.splat %Q : !tt.ptr<bf16> -> tensor<128x1x!tt.ptr<bf16>, #blocked> loc(#loc299)
102
+ %ptr_33 = tt.addptr %ptr_32, %ptr_31 : tensor<128x1x!tt.ptr<bf16>, #blocked>, tensor<128x1xi32, #blocked> loc(#loc299)
103
+ %ptr_34 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc300)
104
+ %ptr_35 = tt.expand_dims %ptr_34 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc300)
105
+ %ptr_36 = tt.broadcast %ptr_33 : tensor<128x1x!tt.ptr<bf16>, #blocked> -> tensor<128x128x!tt.ptr<bf16>, #blocked> loc(#loc301)
106
+ %ptr_37 = tt.broadcast %ptr_35 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc301)
107
+ %ptr_38 = tt.addptr %ptr_36, %ptr_37 : tensor<128x128x!tt.ptr<bf16>, #blocked>, tensor<128x128xi32, #blocked> loc(#loc301)
108
+ %q = tt.splat %ks0 : i32 -> tensor<128x1xi32, #blocked> loc(#loc302)
109
+ %q_39 = tt.splat %ks0 : i32 -> tensor<128x1xi32, #mma> loc(#loc302)
110
+ %q_40 = arith.cmpi slt, %ptr, %q : tensor<128x1xi32, #blocked> loc(#loc302)
111
+ %q_41 = tt.broadcast %q_40 : tensor<128x1xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc303)
112
+ %q_42 = tt.load %ptr_38, %q_41, %cst_10 : tensor<128x128x!tt.ptr<bf16>, #blocked> loc(#loc303)
113
+ %q_43 = ttg.local_alloc %q_42 : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc303)
114
+ %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset : !tt.ptr<i32>, i32 loc(#loc188)
115
+ %kv_start = tt.load %kv_indices : !tt.ptr<i32> loc(#loc189)
116
+ %kv_start_44 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc190)
117
+ %kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %q_start : !tt.ptr<i32>, i32 loc(#loc191)
118
+ %kv_num_blocks_45 = tt.load %kv_num_blocks : !tt.ptr<i32> loc(#loc192)
119
+ %block_n_end = arith.muli %kv_num_blocks_45, %c2_i32 : i32 loc(#loc193)
120
+ %block_n_end_46 = arith.addi %ks1, %c63_i32 : i32 loc(#loc304)
121
+ %block_n_end_47 = arith.divsi %block_n_end_46, %c64_i32 : i32 loc(#loc305)
122
+ %block_n_end_48 = arith.maxsi %block_n_end_47, %c1_i32 : i32 loc(#loc195)
123
+ %block_n_end_49 = arith.minsi %block_n_end, %block_n_end_48 : i32 loc(#loc196)
124
+ %offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc197)
125
+ %offs_n_50 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc197)
126
+ %offs_n_51 = tt.splat %kv_start_44 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc198)
127
+ %offs_n_52 = arith.addi %offs_n_51, %offs_n : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc198)
128
+ %1 = tt.expand_dims %offs_n_52 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc39)
129
+ %ptr_53 = tt.splat %K : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>, #blocked> loc(#loc393)
130
+ %ptr_54 = tt.broadcast %ptr_35 : tensor<1x128xi32, #blocked> -> tensor<64x128xi32, #blocked> loc(#loc394)
131
+ %k = tt.splat %ks1 : i32 -> tensor<64x1xi32, #blocked> loc(#loc395)
132
+ %m = arith.remsi %ptr_30, %q_39 : tensor<128x1xi32, #mma> loc(#loc396)
133
+ %n = tt.splat %ks1 : i32 -> tensor<1x64xi32, #mma> loc(#loc397)
134
+ %tmp3 = arith.cmpi slt, %m, %cst_0 : tensor<128x1xi32, #mma> loc(#loc309)
135
+ %tmp5 = tt.broadcast %m : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc310)
136
+ %tmp6 = tt.broadcast %tmp3 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc311)
137
+ %tmp7 = arith.cmpi sge, %m, %cst_0 : tensor<128x1xi32, #mma> loc(#loc312)
138
+ %tmp9 = tt.broadcast %tmp7 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc313)
139
+ %tmp14 = arith.remsi %m, %cst_5 : tensor<128x1xi32, #mma> loc(#loc314)
140
+ %tmp14_55 = arith.cmpi ne, %tmp14, %cst_0 : tensor<128x1xi32, #mma> loc(#loc315)
141
+ %tmp14_56 = arith.divsi %m, %cst_5 : tensor<128x1xi32, #mma> loc(#loc316)
142
+ %tmp14_57 = arith.subi %tmp14_56, %cst_1 : tensor<128x1xi32, #mma> loc(#loc317)
143
+ %tmp14_58 = arith.select %tmp14_55, %tmp14_57, %tmp14_56 : tensor<128x1xi1, #mma>, tensor<128x1xi32, #mma> loc(#loc318)
144
+ %tmp14_59 = arith.select %tmp3, %tmp14_58, %tmp14_56 : tensor<128x1xi1, #mma>, tensor<128x1xi32, #mma> loc(#loc319)
145
+ %tmp17 = tt.broadcast %tmp14_59 : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc320)
146
+ %ptr_60 = tt.splat %V : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>, #blocked> loc(#loc398)
147
+ %k_61 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc399)
148
+ %v = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc400)
149
+ %kv_offset = arith.cmpi sgt, %block_n_end_49, %c0_i32 : i32 loc(#loc462)
150
+ %offs_n_load = tt.splat %kv_start_44 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc323)
151
+ %offs_n_load_62 = arith.addi %offs_n_load, %offs_n_50 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc323)
152
+ %ptr_63 = tt.expand_dims %offs_n_load_62 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc402)
153
+ %ptr_64 = arith.muli %ptr_63, %cst_7 : tensor<64x1xi32, #blocked> loc(#loc403)
154
+ %ptr_65 = tt.addptr %ptr_53, %ptr_64 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc393)
155
+ %ptr_66 = tt.broadcast %ptr_65 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc394)
156
+ %ptr_67 = tt.addptr %ptr_66, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc394)
157
+ %k_68 = arith.cmpi slt, %ptr_63, %k : tensor<64x1xi32, #blocked> loc(#loc395)
158
+ %k_69 = tt.broadcast %k_68 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc399)
159
+ %k_70 = ttg.memdesc_index %k_61[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
160
+ %kv_offset_71 = tt.splat %kv_offset : i1 -> tensor<64x128xi1, #blocked> loc(#loc462)
161
+ %kv_offset_72 = arith.andi %kv_offset_71, %k_69 : tensor<64x128xi1, #blocked> loc(#loc462)
162
+ %k_73 = ttg.async_copy_global_to_local %ptr_67, %k_70 mask %kv_offset_72 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
163
+ %k_74 = ttg.async_commit_group tokens %k_73 loc(#loc399)
164
+ %ptr_75 = tt.addptr %ptr_60, %ptr_64 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc398)
165
+ %ptr_76 = tt.broadcast %ptr_75 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc404)
166
+ %ptr_77 = tt.addptr %ptr_76, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc404)
167
+ %v_78 = ttg.memdesc_index %v[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
168
+ %v_79 = ttg.async_copy_global_to_local %ptr_77, %v_78 mask %kv_offset_72 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
169
+ %v_80 = ttg.async_commit_group tokens %v_79 loc(#loc400)
170
+ %kv_offset_81 = arith.cmpi sgt, %block_n_end_49, %c1_i32 : i32 loc(#loc462)
171
+ %kv_base_offset = arith.addi %kv_start_44, %c64_i32 : i32 loc(#loc324)
172
+ %offs_n_load_82 = tt.splat %kv_base_offset : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc323)
173
+ %offs_n_load_83 = arith.addi %offs_n_load_82, %offs_n_50 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc323)
174
+ %ptr_84 = tt.expand_dims %offs_n_load_83 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc402)
175
+ %ptr_85 = arith.muli %ptr_84, %cst_7 : tensor<64x1xi32, #blocked> loc(#loc403)
176
+ %ptr_86 = tt.addptr %ptr_53, %ptr_85 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc393)
177
+ %ptr_87 = tt.broadcast %ptr_86 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc394)
178
+ %ptr_88 = tt.addptr %ptr_87, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc394)
179
+ %k_89 = arith.cmpi slt, %ptr_84, %k : tensor<64x1xi32, #blocked> loc(#loc395)
180
+ %k_90 = tt.broadcast %k_89 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc399)
181
+ %k_91 = ttg.memdesc_index %k_61[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
182
+ %kv_offset_92 = tt.splat %kv_offset_81 : i1 -> tensor<64x128xi1, #blocked> loc(#loc462)
183
+ %kv_offset_93 = arith.andi %kv_offset_92, %k_90 : tensor<64x128xi1, #blocked> loc(#loc462)
184
+ %k_94 = ttg.async_copy_global_to_local %ptr_88, %k_91 mask %kv_offset_93 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
185
+ %k_95 = ttg.async_commit_group tokens %k_94 loc(#loc399)
186
+ %ptr_96 = tt.addptr %ptr_60, %ptr_85 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc398)
187
+ %ptr_97 = tt.broadcast %ptr_96 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc404)
188
+ %ptr_98 = tt.addptr %ptr_97, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc404)
189
+ %v_99 = ttg.memdesc_index %v[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
190
+ %v_100 = ttg.async_copy_global_to_local %ptr_98, %v_99 mask %kv_offset_93 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
191
+ %v_101 = ttg.async_commit_group tokens %v_100 loc(#loc400)
192
+ ttng.fence_async_shared {bCluster = false} loc(#loc325)
193
+ %kv_offset_102:12 = scf.for %kv_offset_173 = %c0_i32 to %block_n_end_49 step %c1_i32 iter_args(%acc_174 = %cst_14, %arg17 = %cst_12, %arg18 = %cst_18, %arg19 = %c64_i32, %arg20 = %1, %arg21 = %c1_i32, %arg22 = %c-1_i32, %k_175 = %k_74, %k_176 = %k_95, %v_177 = %v_80, %v_178 = %v_101, %arg27 = %c64_i32) -> (tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32) : i32 {
194
+ %kv_offset_179 = arith.subi %block_n_end_49, %c2_i32 : i32 loc(#loc462)
195
+ %kv_offset_180 = arith.cmpi slt, %kv_offset_173, %kv_offset_179 : i32 loc(#loc462)
196
+ %kv_offset_181 = arith.subi %block_n_end_49, %c1_i32 : i32 loc(#loc462)
197
+ %kv_offset_182 = arith.cmpi slt, %kv_offset_173, %kv_offset_181 : i32 loc(#loc462)
198
+ %kv_offset_183 = arith.addi %arg22, %c1_i32 : i32 loc(#loc462)
199
+ %kv_offset_184 = arith.cmpi sge, %kv_offset_183, %c3_i32 : i32 loc(#loc462)
200
+ %kv_offset_185 = arith.select %kv_offset_184, %c0_i32, %kv_offset_183 : i32 loc(#loc462)
201
+ %k_186 = ttg.async_wait %k_175, %v_177 {num = 2 : i32} loc(#loc399)
202
+ %k_187 = ttg.memdesc_index %k_61[%kv_offset_185] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
203
+ %k_188 = ttg.memdesc_trans %k_187 {order = array<i32: 1, 0>} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc326)
204
+ %qk = ttng.warp_group_dot %q_43, %k_188, %cst_6 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc325)
205
+ %qk_189:4 = ttng.warp_group_dot_wait %qk, %q_43, %k_188, %acc_174 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64>, tensor<128x128xf32, #mma1> loc(#loc325)
206
+ %qk_190 = arith.mulf %qk_189#0, %cst_15 : tensor<128x64xf32, #mma> loc(#loc327)
207
+ %n_191 = arith.remsi %arg20, %n : tensor<1x64xi32, #mma> loc(#loc397)
208
+ %post_mod_scores = arith.cmpi slt, %arg20, %n : tensor<1x64xi32, #mma> loc(#loc328)
209
+ %post_mod_scores_192 = tt.broadcast %post_mod_scores : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc329)
210
+ %post_mod_scores_193 = arith.select %post_mod_scores_192, %qk_190, %cst_16 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc329)
211
+ %tmp5_194 = tt.broadcast %n_191 : tensor<1x64xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc310)
212
+ %tmp5_195 = arith.cmpi sle, %tmp5_194, %tmp5 : tensor<128x64xi32, #mma> loc(#loc310)
213
+ %tmp6_196 = arith.andi %tmp6, %tmp5_195 : tensor<128x64xi1, #mma> loc(#loc311)
214
+ %tmp8 = arith.cmpi slt, %n_191, %cst : tensor<1x64xi32, #mma> loc(#loc330)
215
+ %tmp9_197 = tt.broadcast %tmp8 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc313)
216
+ %tmp9_198 = arith.andi %tmp9, %tmp9_197 : tensor<128x64xi1, #mma> loc(#loc313)
217
+ %tmp10 = arith.extui %tmp8 : tensor<1x64xi1, #mma> to tensor<1x64xi32, #mma> loc(#loc331)
218
+ %tmp10_199 = arith.cmpi eq, %tmp10, %cst : tensor<1x64xi32, #mma> loc(#loc331)
219
+ %tmp11 = tt.broadcast %tmp10_199 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc332)
220
+ %tmp11_200 = arith.andi %tmp9, %tmp11 : tensor<128x64xi1, #mma> loc(#loc332)
221
+ %tmp16 = arith.remsi %n_191, %cst_4 : tensor<1x64xi32, #mma> loc(#loc333)
222
+ %tmp16_201 = arith.cmpi ne, %tmp16, %cst : tensor<1x64xi32, #mma> loc(#loc334)
223
+ %tmp16_202 = arith.divsi %n_191, %cst_4 : tensor<1x64xi32, #mma> loc(#loc335)
224
+ %tmp16_203 = arith.subi %tmp16_202, %cst_2 : tensor<1x64xi32, #mma> loc(#loc336)
225
+ %tmp16_204 = arith.select %tmp16_201, %tmp16_203, %tmp16_202 : tensor<1x64xi1, #mma>, tensor<1x64xi32, #mma> loc(#loc337)
226
+ %tmp16_205 = arith.select %tmp8, %tmp16_204, %tmp16_202 : tensor<1x64xi1, #mma>, tensor<1x64xi32, #mma> loc(#loc338)
227
+ %tmp17_206 = tt.broadcast %tmp16_205 : tensor<1x64xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc320)
228
+ %tmp17_207 = arith.cmpi eq, %tmp17, %tmp17_206 : tensor<128x64xi32, #mma> loc(#loc320)
229
+ %tmp18 = arith.andi %tmp11_200, %tmp17_207 : tensor<128x64xi1, #mma> loc(#loc339)
230
+ %tmp19 = arith.ori %tmp9_198, %tmp18 : tensor<128x64xi1, #mma> loc(#loc340)
231
+ %tmp20 = arith.ori %tmp6_196, %tmp19 : tensor<128x64xi1, #mma> loc(#loc341)
232
+ %mask_mod_output = arith.select %post_mod_scores_192, %tmp20, %cst_3 : tensor<128x64xi1, #mma>, tensor<128x64xi1, #mma> loc(#loc342)
233
+ %post_mod_scores_208 = arith.select %mask_mod_output, %post_mod_scores_193, %cst_16 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc343)
234
+ %post_mod_scores_209 = arith.mulf %post_mod_scores_208, %cst_17 : tensor<128x64xf32, #mma> loc(#loc344)
235
+ %m_ij = "tt.reduce"(%post_mod_scores_209) <{axis = 1 : i32}> ({
236
+ ^bb0(%m_ij_267: f32 loc(callsite(#loc1 at #loc345)), %m_ij_268: f32 loc(callsite(#loc1 at #loc345))):
237
+ %m_ij_269 = arith.maxnumf %m_ij_267, %m_ij_268 : f32 loc(#loc457)
238
+ tt.reduce.return %m_ij_269 : f32 loc(#loc405)
239
+ }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc405)
240
+ %m_ij_210 = arith.maxnumf %arg18, %m_ij : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc346)
241
+ %masked_out_rows = arith.cmpf oeq, %m_ij_210, %cst_18 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc347)
242
+ %m_ij_masked = arith.select %masked_out_rows, %cst_12, %m_ij_210 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc348)
243
+ %alpha = arith.subf %arg18, %m_ij_masked : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc349)
244
+ %alpha_211 = math.exp2 %alpha : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc350)
245
+ %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc351)
246
+ %p_212 = tt.broadcast %p : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc352)
247
+ %p_213 = arith.subf %post_mod_scores_209, %p_212 : tensor<128x64xf32, #mma> loc(#loc352)
248
+ %p_214 = math.exp2 %p_213 : tensor<128x64xf32, #mma> loc(#loc353)
249
+ %l_i_215 = arith.mulf %arg17, %alpha_211 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc354)
250
+ %l_i_216 = "tt.reduce"(%p_214) <{axis = 1 : i32}> ({
251
+ ^bb0(%l_i_267: f32 loc(callsite(#loc1 at #loc355)), %l_i_268: f32 loc(callsite(#loc1 at #loc355))):
252
+ %l_i_269 = arith.addf %l_i_267, %l_i_268 : f32 loc(#loc458)
253
+ tt.reduce.return %l_i_269 : f32 loc(#loc407)
254
+ }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc407)
255
+ %l_i_217 = arith.addf %l_i_215, %l_i_216 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc356)
256
+ %acc_218 = tt.expand_dims %alpha_211 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc357)
257
+ %acc_219 = ttg.convert_layout %acc_218 : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc358)
258
+ %acc_220 = tt.broadcast %acc_219 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc358)
259
+ %acc_221 = arith.mulf %qk_189#3, %acc_220 : tensor<128x128xf32, #mma1> loc(#loc358)
260
+ %v_222 = ttg.memdesc_index %v[%kv_offset_185] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
261
+ %acc_223 = arith.truncf %p_214 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc359)
262
+ %acc_224 = ttg.convert_layout %acc_223 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc359)
263
+ %acc_225 = ttng.warp_group_dot %acc_224, %v_222, %acc_221 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc360)
264
+ %offs_n_226 = tt.splat %arg27 : i32 -> tensor<1x64xi32, #mma> loc(#loc361)
265
+ %offs_n_227 = arith.addi %arg20, %offs_n_226 : tensor<1x64xi32, #mma> loc(#loc361)
266
+ %kv_offset_228 = arith.addi %kv_offset_173, %c1_i32 : i32 loc(#loc462)
267
+ %cur_block_idx = arith.divsi %kv_offset_228, %c2_i32 : i32 loc(#loc409)
268
+ %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr<i32>, i32 loc(#loc410)
269
+ %cur_block_229 = tt.load %cur_block, %kv_offset_182 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc411)
270
+ %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc412)
271
+ %next_block_230 = arith.cmpi slt, %next_block, %kv_num_blocks_45 : i32 loc(#loc413)
272
+ %next_block_231 = tt.addptr %cur_block, %c1_i32 : !tt.ptr<i32>, i32 loc(#loc414)
273
+ %kv_offset_232 = arith.andi %kv_offset_182, %next_block_230 : i1 loc(#loc462)
274
+ %next_block_233 = tt.load %next_block_231, %kv_offset_232 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc415)
275
+ %needs_jump = arith.addi %kv_offset_173, %c2_i32 : i32 loc(#loc416)
276
+ %needs_jump_234 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc417)
277
+ %needs_jump_235 = arith.cmpi eq, %needs_jump_234, %c0_i32 : i32 loc(#loc418)
278
+ %jump_to_block = arith.subi %next_block_233, %cur_block_229 : i32 loc(#loc419)
279
+ %jump_to_block_236 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc420)
280
+ %jump_to_block_237 = arith.subi %jump_to_block_236, %c64_i32 : i32 loc(#loc421)
281
+ %offset = arith.extui %needs_jump_235 : i1 to i32 loc(#loc422)
282
+ %offset_238 = arith.muli %jump_to_block_237, %offset : i32 loc(#loc422)
283
+ %offset_239 = arith.subi %c1_i32, %offset : i32 loc(#loc423)
284
+ %offset_240 = arith.muli %offset_239, %c64_i32 : i32 loc(#loc424)
285
+ %offset_241 = arith.addi %offset_238, %offset_240 : i32 loc(#loc425)
286
+ %kv_offset_242 = arith.addi %arg19, %offset_241 : i32 loc(#loc363)
287
+ %kv_offset_243 = arith.addi %arg21, %c1_i32 : i32 loc(#loc462)
288
+ %kv_offset_244 = arith.cmpi sge, %kv_offset_243, %c3_i32 : i32 loc(#loc462)
289
+ %kv_offset_245 = arith.select %kv_offset_244, %c0_i32, %kv_offset_243 : i32 loc(#loc462)
290
+ %kv_base_offset_246 = arith.addi %kv_start_44, %kv_offset_242 : i32 loc(#loc324)
291
+ %offs_n_load_247 = tt.splat %kv_base_offset_246 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc323)
292
+ %offs_n_load_248 = arith.addi %offs_n_load_247, %offs_n_50 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc323)
293
+ %ptr_249 = tt.expand_dims %offs_n_load_248 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc402)
294
+ %ptr_250 = arith.muli %ptr_249, %cst_7 : tensor<64x1xi32, #blocked> loc(#loc403)
295
+ %ptr_251 = tt.addptr %ptr_53, %ptr_250 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc393)
296
+ %ptr_252 = tt.broadcast %ptr_251 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc394)
297
+ %ptr_253 = tt.addptr %ptr_252, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc394)
298
+ %k_254 = arith.cmpi slt, %ptr_249, %k : tensor<64x1xi32, #blocked> loc(#loc395)
299
+ %k_255 = tt.broadcast %k_254 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc399)
300
+ %k_256 = ttg.memdesc_index %k_61[%kv_offset_245] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
301
+ %kv_offset_257 = tt.splat %kv_offset_180 : i1 -> tensor<64x128xi1, #blocked> loc(#loc462)
302
+ %kv_offset_258 = arith.andi %kv_offset_257, %k_255 : tensor<64x128xi1, #blocked> loc(#loc462)
303
+ %k_259 = ttg.async_copy_global_to_local %ptr_253, %k_256 mask %kv_offset_258 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc399)
304
+ %k_260 = ttg.async_commit_group tokens %k_259 loc(#loc399)
305
+ %ptr_261 = tt.addptr %ptr_60, %ptr_250 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc398)
306
+ %ptr_262 = tt.broadcast %ptr_261 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc404)
307
+ %ptr_263 = tt.addptr %ptr_262, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc404)
308
+ %v_264 = ttg.memdesc_index %v[%kv_offset_245] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
309
+ %v_265 = ttg.async_copy_global_to_local %ptr_263, %v_264 mask %kv_offset_258 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc400)
310
+ %v_266 = ttg.async_commit_group tokens %v_265 loc(#loc400)
311
+ scf.yield %acc_225, %l_i_217, %m_ij_210, %kv_offset_242, %offs_n_227, %kv_offset_245, %kv_offset_185, %k_176, %k_260, %v_178, %v_266, %offset_241 : tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc462)
312
+ } loc(#loc462)
313
+ %kv_offset_103 = ttng.warp_group_dot_wait %kv_offset_102#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc462)
314
+ %kv_offset_104 = ttg.async_wait {num = 0 : i32} loc(#loc462)
315
+ ttg.local_dealloc %v : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc462)
316
+ ttg.local_dealloc %k_61 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc462)
317
+ %kv_indices_105 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset : !tt.ptr<i32>, i32 loc(#loc275)
318
+ %kv_start_106 = tt.load %kv_indices_105 : !tt.ptr<i32> loc(#loc276)
319
+ %kv_start_107 = arith.muli %kv_start_106, %c128_i32 : i32 loc(#loc277)
320
+ %kv_num_blocks_108 = tt.addptr %arg_FULL_KV_NUM_BLKS, %q_start : !tt.ptr<i32>, i32 loc(#loc278)
321
+ %kv_num_blocks_109 = tt.load %kv_num_blocks_108 : !tt.ptr<i32> loc(#loc279)
322
+ %block_n_end_110 = arith.muli %kv_num_blocks_109, %c2_i32 : i32 loc(#loc280)
323
+ %block_n_end_111 = arith.minsi %block_n_end_110, %block_n_end_48 : i32 loc(#loc281)
324
+ %offs_n_112 = tt.splat %kv_start_107 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc282)
325
+ %offs_n_113 = arith.addi %offs_n_112, %offs_n : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc282)
326
+ %2 = tt.expand_dims %offs_n_113 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc130)
327
+ %k_114 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc426)
328
+ %v_115 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc427)
329
+ %kv_offset_116 = arith.cmpi sgt, %block_n_end_111, %c0_i32 : i32 loc(#loc463)
330
+ %offs_n_load_117 = tt.splat %kv_start_107 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc366)
331
+ %offs_n_load_118 = arith.addi %offs_n_load_117, %offs_n_50 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc366)
332
+ %ptr_119 = tt.expand_dims %offs_n_load_118 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc428)
333
+ %ptr_120 = arith.muli %ptr_119, %cst_7 : tensor<64x1xi32, #blocked> loc(#loc429)
334
+ %ptr_121 = tt.addptr %ptr_53, %ptr_120 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc430)
335
+ %ptr_122 = tt.broadcast %ptr_121 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc431)
336
+ %ptr_123 = tt.addptr %ptr_122, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc431)
337
+ %k_124 = arith.cmpi slt, %ptr_119, %k : tensor<64x1xi32, #blocked> loc(#loc432)
338
+ %k_125 = tt.broadcast %k_124 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc426)
339
+ %k_126 = ttg.memdesc_index %k_114[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
340
+ %kv_offset_127 = tt.splat %kv_offset_116 : i1 -> tensor<64x128xi1, #blocked> loc(#loc463)
341
+ %kv_offset_128 = arith.andi %kv_offset_127, %k_125 : tensor<64x128xi1, #blocked> loc(#loc463)
342
+ %k_129 = ttg.async_copy_global_to_local %ptr_123, %k_126 mask %kv_offset_128 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
343
+ %k_130 = ttg.async_commit_group tokens %k_129 loc(#loc426)
344
+ %ptr_131 = tt.addptr %ptr_60, %ptr_120 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc433)
345
+ %ptr_132 = tt.broadcast %ptr_131 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc434)
346
+ %ptr_133 = tt.addptr %ptr_132, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc434)
347
+ %v_134 = ttg.memdesc_index %v_115[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
348
+ %v_135 = ttg.async_copy_global_to_local %ptr_133, %v_134 mask %kv_offset_128 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
349
+ %v_136 = ttg.async_commit_group tokens %v_135 loc(#loc427)
350
+ %kv_offset_137 = arith.cmpi sgt, %block_n_end_111, %c1_i32 : i32 loc(#loc463)
351
+ %kv_base_offset_138 = arith.addi %kv_start_107, %c64_i32 : i32 loc(#loc367)
352
+ %offs_n_load_139 = tt.splat %kv_base_offset_138 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc366)
353
+ %offs_n_load_140 = arith.addi %offs_n_load_139, %offs_n_50 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc366)
354
+ %ptr_141 = tt.expand_dims %offs_n_load_140 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc428)
355
+ %ptr_142 = arith.muli %ptr_141, %cst_7 : tensor<64x1xi32, #blocked> loc(#loc429)
356
+ %ptr_143 = tt.addptr %ptr_53, %ptr_142 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc430)
357
+ %ptr_144 = tt.broadcast %ptr_143 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc431)
358
+ %ptr_145 = tt.addptr %ptr_144, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc431)
359
+ %k_146 = arith.cmpi slt, %ptr_141, %k : tensor<64x1xi32, #blocked> loc(#loc432)
360
+ %k_147 = tt.broadcast %k_146 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc426)
361
+ %k_148 = ttg.memdesc_index %k_114[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
362
+ %kv_offset_149 = tt.splat %kv_offset_137 : i1 -> tensor<64x128xi1, #blocked> loc(#loc463)
363
+ %kv_offset_150 = arith.andi %kv_offset_149, %k_147 : tensor<64x128xi1, #blocked> loc(#loc463)
364
+ %k_151 = ttg.async_copy_global_to_local %ptr_145, %k_148 mask %kv_offset_150 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
365
+ %k_152 = ttg.async_commit_group tokens %k_151 loc(#loc426)
366
+ %ptr_153 = tt.addptr %ptr_60, %ptr_142 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc433)
367
+ %ptr_154 = tt.broadcast %ptr_153 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc434)
368
+ %ptr_155 = tt.addptr %ptr_154, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc434)
369
+ %v_156 = ttg.memdesc_index %v_115[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
370
+ %v_157 = ttg.async_copy_global_to_local %ptr_155, %v_156 mask %kv_offset_150 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
371
+ %v_158 = ttg.async_commit_group tokens %v_157 loc(#loc427)
372
+ ttng.fence_async_shared {bCluster = false} loc(#loc368)
373
+ %kv_offset_159:12 = scf.for %kv_offset_173 = %c0_i32 to %block_n_end_111 step %c1_i32 iter_args(%kv_offset_174 = %kv_offset_103, %kv_offset_175 = %kv_offset_102#1, %kv_offset_176 = %kv_offset_102#2, %arg19 = %c64_i32, %arg20 = %2, %arg21 = %c1_i32, %arg22 = %c-1_i32, %k_177 = %k_130, %k_178 = %k_152, %v_179 = %v_136, %v_180 = %v_158, %arg27 = %c64_i32) -> (tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32) : i32 {
374
+ %kv_offset_181 = arith.subi %block_n_end_111, %c2_i32 : i32 loc(#loc463)
375
+ %kv_offset_182 = arith.cmpi slt, %kv_offset_173, %kv_offset_181 : i32 loc(#loc463)
376
+ %kv_offset_183 = arith.subi %block_n_end_111, %c1_i32 : i32 loc(#loc463)
377
+ %kv_offset_184 = arith.cmpi slt, %kv_offset_173, %kv_offset_183 : i32 loc(#loc463)
378
+ %kv_offset_185 = arith.addi %arg22, %c1_i32 : i32 loc(#loc463)
379
+ %kv_offset_186 = arith.cmpi sge, %kv_offset_185, %c3_i32 : i32 loc(#loc463)
380
+ %kv_offset_187 = arith.select %kv_offset_186, %c0_i32, %kv_offset_185 : i32 loc(#loc463)
381
+ %k_188 = ttg.async_wait %k_177, %v_179 {num = 2 : i32} loc(#loc426)
382
+ %k_189 = ttg.memdesc_index %k_114[%kv_offset_187] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
383
+ %k_190 = ttg.memdesc_trans %k_189 {order = array<i32: 1, 0>} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc369)
384
+ %qk = ttng.warp_group_dot %q_43, %k_190, %cst_6 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc368)
385
+ %qk_191:4 = ttng.warp_group_dot_wait %qk, %q_43, %k_190, %kv_offset_174 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64>, tensor<128x128xf32, #mma1> loc(#loc368)
386
+ %qk_192 = arith.mulf %qk_191#0, %cst_15 : tensor<128x64xf32, #mma> loc(#loc370)
387
+ %post_mod_scores = arith.cmpi slt, %arg20, %n : tensor<1x64xi32, #mma> loc(#loc371)
388
+ %post_mod_scores_193 = tt.broadcast %post_mod_scores : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc372)
389
+ %post_mod_scores_194 = arith.select %post_mod_scores_193, %qk_192, %cst_16 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc372)
390
+ %post_mod_scores_195 = arith.mulf %post_mod_scores_194, %cst_17 : tensor<128x64xf32, #mma> loc(#loc373)
391
+ %m_ij = "tt.reduce"(%post_mod_scores_195) <{axis = 1 : i32}> ({
392
+ ^bb0(%m_ij_253: f32 loc(callsite(#loc1 at #loc374)), %m_ij_254: f32 loc(callsite(#loc1 at #loc374))):
393
+ %m_ij_255 = arith.maxnumf %m_ij_253, %m_ij_254 : f32 loc(#loc459)
394
+ tt.reduce.return %m_ij_255 : f32 loc(#loc435)
395
+ }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc435)
396
+ %m_ij_196 = arith.maxnumf %kv_offset_176, %m_ij : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc375)
397
+ %masked_out_rows = arith.cmpf oeq, %m_ij_196, %cst_18 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc376)
398
+ %m_ij_masked = arith.select %masked_out_rows, %cst_12, %m_ij_196 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc377)
399
+ %alpha = arith.subf %kv_offset_176, %m_ij_masked : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc378)
400
+ %alpha_197 = math.exp2 %alpha : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc379)
401
+ %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc380)
402
+ %p_198 = tt.broadcast %p : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc381)
403
+ %p_199 = arith.subf %post_mod_scores_195, %p_198 : tensor<128x64xf32, #mma> loc(#loc381)
404
+ %p_200 = math.exp2 %p_199 : tensor<128x64xf32, #mma> loc(#loc382)
405
+ %l_i_201 = arith.mulf %kv_offset_175, %alpha_197 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc383)
406
+ %l_i_202 = "tt.reduce"(%p_200) <{axis = 1 : i32}> ({
407
+ ^bb0(%l_i_253: f32 loc(callsite(#loc1 at #loc384)), %l_i_254: f32 loc(callsite(#loc1 at #loc384))):
408
+ %l_i_255 = arith.addf %l_i_253, %l_i_254 : f32 loc(#loc460)
409
+ tt.reduce.return %l_i_255 : f32 loc(#loc437)
410
+ }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc437)
411
+ %l_i_203 = arith.addf %l_i_201, %l_i_202 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc385)
412
+ %acc_204 = tt.expand_dims %alpha_197 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc386)
413
+ %acc_205 = ttg.convert_layout %acc_204 : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc387)
414
+ %acc_206 = tt.broadcast %acc_205 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc387)
415
+ %acc_207 = arith.mulf %qk_191#3, %acc_206 : tensor<128x128xf32, #mma1> loc(#loc387)
416
+ %v_208 = ttg.memdesc_index %v_115[%kv_offset_187] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
417
+ %acc_209 = arith.truncf %p_200 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc388)
418
+ %acc_210 = ttg.convert_layout %acc_209 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc388)
419
+ %acc_211 = ttng.warp_group_dot %acc_210, %v_208, %acc_207 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc389)
420
+ %offs_n_212 = tt.splat %arg27 : i32 -> tensor<1x64xi32, #mma> loc(#loc390)
421
+ %offs_n_213 = arith.addi %arg20, %offs_n_212 : tensor<1x64xi32, #mma> loc(#loc390)
422
+ %kv_offset_214 = arith.addi %kv_offset_173, %c1_i32 : i32 loc(#loc463)
423
+ %cur_block_idx = arith.divsi %kv_offset_214, %c2_i32 : i32 loc(#loc439)
424
+ %cur_block = tt.addptr %kv_indices_105, %cur_block_idx : !tt.ptr<i32>, i32 loc(#loc440)
425
+ %cur_block_215 = tt.load %cur_block, %kv_offset_184 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc441)
426
+ %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc442)
427
+ %next_block_216 = arith.cmpi slt, %next_block, %kv_num_blocks_109 : i32 loc(#loc443)
428
+ %next_block_217 = tt.addptr %cur_block, %c1_i32 : !tt.ptr<i32>, i32 loc(#loc444)
429
+ %kv_offset_218 = arith.andi %kv_offset_184, %next_block_216 : i1 loc(#loc463)
430
+ %next_block_219 = tt.load %next_block_217, %kv_offset_218 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc445)
431
+ %needs_jump = arith.addi %kv_offset_173, %c2_i32 : i32 loc(#loc446)
432
+ %needs_jump_220 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc447)
433
+ %needs_jump_221 = arith.cmpi eq, %needs_jump_220, %c0_i32 : i32 loc(#loc448)
434
+ %jump_to_block = arith.subi %next_block_219, %cur_block_215 : i32 loc(#loc449)
435
+ %jump_to_block_222 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc450)
436
+ %jump_to_block_223 = arith.subi %jump_to_block_222, %c64_i32 : i32 loc(#loc451)
437
+ %offset = arith.extui %needs_jump_221 : i1 to i32 loc(#loc452)
438
+ %offset_224 = arith.muli %jump_to_block_223, %offset : i32 loc(#loc452)
439
+ %offset_225 = arith.subi %c1_i32, %offset : i32 loc(#loc453)
440
+ %offset_226 = arith.muli %offset_225, %c64_i32 : i32 loc(#loc454)
441
+ %offset_227 = arith.addi %offset_224, %offset_226 : i32 loc(#loc455)
442
+ %kv_offset_228 = arith.addi %arg19, %offset_227 : i32 loc(#loc392)
443
+ %kv_offset_229 = arith.addi %arg21, %c1_i32 : i32 loc(#loc463)
444
+ %kv_offset_230 = arith.cmpi sge, %kv_offset_229, %c3_i32 : i32 loc(#loc463)
445
+ %kv_offset_231 = arith.select %kv_offset_230, %c0_i32, %kv_offset_229 : i32 loc(#loc463)
446
+ %kv_base_offset_232 = arith.addi %kv_start_107, %kv_offset_228 : i32 loc(#loc367)
447
+ %offs_n_load_233 = tt.splat %kv_base_offset_232 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc366)
448
+ %offs_n_load_234 = arith.addi %offs_n_load_233, %offs_n_50 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc366)
449
+ %ptr_235 = tt.expand_dims %offs_n_load_234 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc428)
450
+ %ptr_236 = arith.muli %ptr_235, %cst_7 : tensor<64x1xi32, #blocked> loc(#loc429)
451
+ %ptr_237 = tt.addptr %ptr_53, %ptr_236 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc430)
452
+ %ptr_238 = tt.broadcast %ptr_237 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc431)
453
+ %ptr_239 = tt.addptr %ptr_238, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc431)
454
+ %k_240 = arith.cmpi slt, %ptr_235, %k : tensor<64x1xi32, #blocked> loc(#loc432)
455
+ %k_241 = tt.broadcast %k_240 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc426)
456
+ %k_242 = ttg.memdesc_index %k_114[%kv_offset_231] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
457
+ %kv_offset_243 = tt.splat %kv_offset_182 : i1 -> tensor<64x128xi1, #blocked> loc(#loc463)
458
+ %kv_offset_244 = arith.andi %kv_offset_243, %k_241 : tensor<64x128xi1, #blocked> loc(#loc463)
459
+ %k_245 = ttg.async_copy_global_to_local %ptr_239, %k_242 mask %kv_offset_244 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc426)
460
+ %k_246 = ttg.async_commit_group tokens %k_245 loc(#loc426)
461
+ %ptr_247 = tt.addptr %ptr_60, %ptr_236 : tensor<64x1x!tt.ptr<bf16>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc433)
462
+ %ptr_248 = tt.broadcast %ptr_247 : tensor<64x1x!tt.ptr<bf16>, #blocked> -> tensor<64x128x!tt.ptr<bf16>, #blocked> loc(#loc434)
463
+ %ptr_249 = tt.addptr %ptr_248, %ptr_54 : tensor<64x128x!tt.ptr<bf16>, #blocked>, tensor<64x128xi32, #blocked> loc(#loc434)
464
+ %v_250 = ttg.memdesc_index %v_115[%kv_offset_231] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
465
+ %v_251 = ttg.async_copy_global_to_local %ptr_249, %v_250 mask %kv_offset_244 other %cst_11 : tensor<64x128x!tt.ptr<bf16>, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc427)
466
+ %v_252 = ttg.async_commit_group tokens %v_251 loc(#loc427)
467
+ scf.yield %acc_211, %l_i_203, %m_ij_196, %kv_offset_228, %offs_n_213, %kv_offset_231, %kv_offset_187, %k_178, %k_246, %v_180, %v_252, %offset_227 : tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc463)
468
+ } loc(#loc463)
469
+ %kv_offset_160 = ttng.warp_group_dot_wait %kv_offset_159#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc463)
470
+ %kv_offset_161 = ttg.async_wait {num = 0 : i32} loc(#loc463)
471
+ ttg.local_dealloc %v_115 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc463)
472
+ ttg.local_dealloc %k_114 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc463)
473
+ %l_i = arith.cmpf oeq, %kv_offset_159#1, %cst_12 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc284)
474
+ %l_i_162 = arith.select %l_i, %cst_13, %kv_offset_159#1 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc285)
475
+ %acc = tt.expand_dims %l_i_162 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc286)
476
+ %acc_163 = ttg.convert_layout %acc : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc287)
477
+ %acc_164 = tt.broadcast %acc_163 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc287)
478
+ %acc_165 = arith.divf %kv_offset_160, %acc_164 : tensor<128x128xf32, #mma1> loc(#loc287)
479
+ %mask = arith.cmpi slt, %ptr_35, %cst_9 : tensor<1x128xi32, #blocked> loc(#loc288)
480
+ %mask_166 = tt.broadcast %mask : tensor<1x128xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc289)
481
+ %mask_167 = arith.andi %q_41, %mask_166 : tensor<128x128xi1, #blocked> loc(#loc289)
482
+ %3 = tt.splat %q_offset_19 : i32 -> tensor<1x128xi32, #blocked> loc(#loc138)
483
+ %4 = arith.addi %ptr_35, %3 : tensor<1x128xi32, #blocked> loc(#loc138)
484
+ %5 = tt.broadcast %4 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc139)
485
+ %6 = tt.broadcast %ptr_31 : tensor<128x1xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc139)
486
+ %7 = arith.addi %5, %6 : tensor<128x128xi32, #blocked> loc(#loc139)
487
+ %8 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<128x128x!tt.ptr<bf16>, #blocked> loc(#loc140)
488
+ %9 = tt.addptr %8, %7 : tensor<128x128x!tt.ptr<bf16>, #blocked>, tensor<128x128xi32, #blocked> loc(#loc140)
489
+ %10 = arith.truncf %acc_165 : tensor<128x128xf32, #mma1> to tensor<128x128xbf16, #mma1> loc(#loc141)
490
+ %11 = ttg.convert_layout %10 : tensor<128x128xbf16, #mma1> -> tensor<128x128xbf16, #blocked> loc(#loc141)
491
+ tt.store %9, %11, %mask_167 : tensor<128x128x!tt.ptr<bf16>, #blocked> loc(#loc141)
492
+ %off_hz = arith.muli %off_zq, %c32_i32 : i32 loc(#loc290)
493
+ %off_hz_168 = arith.addi %off_hz, %off_hq : i32 loc(#loc291)
494
+ %l_ptrs = arith.muli %off_hz_168, %ks0 : i32 loc(#loc292)
495
+ %l_ptrs_169 = tt.addptr %arg_LSE, %l_ptrs : !tt.ptr<f32>, i32 loc(#loc293)
496
+ %l_ptrs_170 = tt.splat %l_ptrs_169 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>, #blocked1> loc(#loc294)
497
+ %l_ptrs_171 = tt.addptr %l_ptrs_170, %offs_m_29 : tensor<128x!tt.ptr<f32>, #blocked1>, tensor<128xi32, #blocked1> loc(#loc294)
498
+ %lse = math.log2 %l_i_162 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc295)
499
+ %lse_172 = arith.addf %kv_offset_159#2, %lse : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc296)
500
+ %12 = tt.splat %ks0 : i32 -> tensor<128xi32, #blocked1> loc(#loc149)
501
+ %13 = arith.cmpi slt, %offs_m_29, %12 : tensor<128xi32, #blocked1> loc(#loc149)
502
+ %14 = ttg.convert_layout %lse_172 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128xf32, #blocked1> loc(#loc150)
503
+ tt.store %l_ptrs_171, %14, %13 : tensor<128x!tt.ptr<f32>, #blocked1> loc(#loc150)
504
+ tt.return loc(#loc151)
505
+ } loc(#loc)
506
+ } loc(#loc)
507
+ #loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":85:54)
508
+ #loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":97:28)
509
+ #loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":98:27)
510
+ #loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":99:27)
511
+ #loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":104:24)
512
+ #loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":107:24)
513
+ #loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":107:45)
514
+ #loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":107:36)
515
+ #loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":108:47)
516
+ #loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":111:12)
517
+ #loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":112:12)
518
+ #loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":113:12)
519
+ #loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":143:97)
520
+ #loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":144:23)
521
+ #loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":144:46)
522
+ #loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":144:33)
523
+ #loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":284:27)
524
+ #loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":146:101)
525
+ #loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":284:38)
526
+ #loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":284:20)
527
+ #loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":284:56)
528
+ #loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":284:49)
529
+ #loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":292:52)
530
+ #loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":292:23)
531
+ #loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":151:26)
532
+ #loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":152:23)
533
+ #loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":152:37)
534
+ #loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":153:42)
535
+ #loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":153:28)
536
+ #loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":154:45)
537
+ #loc32 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22)
538
+ #loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":154:92)
539
+ #loc34 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28)
540
+ #loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":154:102)
541
+ #loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":154:65)
542
+ #loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":159:37)
543
+ #loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":159:24)
544
+ #loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":167:48)
545
+ #loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":347:107)
546
+ #loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":257:21)
547
+ #loc44 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":358:36)
548
+ #loc45 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":359:36)
549
+ #loc46 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":372:22)
550
+ #loc47 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":374:23)
551
+ #loc48 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":375:22)
552
+ #loc49 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":376:23)
553
+ #loc50 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":378:22)
554
+ #loc51 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":383:70)
555
+ #loc52 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":383:79)
556
+ #loc53 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":383:91)
557
+ #loc54 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":383:99)
558
+ #loc55 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":383:102)
559
+ #loc56 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":383:119)
560
+ #loc57 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":386:25)
561
+ #loc58 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":421:107)
562
+ #loc59 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":484:40)
563
+ #loc60 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":346:35)
564
+ #loc61 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":342:32)
565
+ #loc62 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":351:19)
566
+ #loc63 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":349:17)
567
+ #loc64 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":353:14)
568
+ #loc65 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":367:44)
569
+ #loc66 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":367:69)
570
+ #loc67 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":377:22)
571
+ #loc68 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":379:24)
572
+ #loc69 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":380:23)
573
+ #loc70 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":385:70)
574
+ #loc71 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":385:79)
575
+ #loc72 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":385:91)
576
+ #loc73 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":385:99)
577
+ #loc74 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":385:102)
578
+ #loc75 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":385:119)
579
+ #loc76 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":387:24)
580
+ #loc77 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":388:23)
581
+ #loc78 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":389:23)
582
+ #loc79 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":394:73)
583
+ #loc80 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":396:69)
584
+ #loc81 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":399:27)
585
+ #loc82 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40)
586
+ #loc84 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27)
587
+ #loc85 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":403:27)
588
+ #loc86 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":405:35)
589
+ #loc87 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":406:51)
590
+ #loc88 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":410:31)
591
+ #loc89 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":410:25)
592
+ #loc90 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":411:51)
593
+ #loc91 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":411:39)
594
+ #loc92 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":411:21)
595
+ #loc93 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":416:16)
596
+ #loc94 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
597
+ #loc96 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
598
+ #loc97 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":416:24)
599
+ #loc98 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":418:22)
600
+ #loc99 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":418:16)
601
+ #loc100 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":422:22)
602
+ #loc101 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":422:44)
603
+ #loc102 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":530:26)
604
+ #loc103 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":247:33)
605
+ #loc104 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":527:63)
606
+ #loc105 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":248:38)
607
+ #loc106 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":248:24)
608
+ #loc107 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":249:109)
609
+ #loc108 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":249:113)
610
+ #loc109 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":249:55)
611
+ #loc110 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":249:25)
612
+ #loc111 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":250:30)
613
+ #loc112 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":250:35)
614
+ #loc113 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":250:60)
615
+ #loc114 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":251:34)
616
+ #loc115 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":251:48)
617
+ #loc116 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":251:63)
618
+ #loc117 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":252:29)
619
+ #loc118 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":252:47)
620
+ #loc119 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":252:61)
621
+ #loc120 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":252:42)
622
+ #loc121 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":531:21)
623
+ #loc122 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":181:35)
624
+ #loc123 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":182:27)
625
+ #loc124 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":182:41)
626
+ #loc125 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":183:51)
627
+ #loc126 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":183:32)
628
+ #loc127 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":184:49)
629
+ #loc128 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":184:69)
630
+ #loc129 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":186:28)
631
+ #loc130 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":193:52)
632
+ #loc132 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":206:26)
633
+ #loc133 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":206:34)
634
+ #loc134 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":208:20)
635
+ #loc135 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":208:16)
636
+ #loc136 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":214:38)
637
+ #loc137 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":214:30)
638
+ #loc138 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":218:49)
639
+ #loc139 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":218:62)
640
+ #loc140 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":218:25)
641
+ #loc141 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":218:92)
642
+ #loc142 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":221:26)
643
+ #loc143 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":221:31)
644
+ #loc144 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":222:32)
645
+ #loc145 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":222:23)
646
+ #loc146 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":222:40)
647
+ #loc147 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":223:33)
648
+ #loc148 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":223:20)
649
+ #loc149 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":227:48)
650
+ #loc150 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":227:29)
651
+ #loc151 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":229:4)
652
+ #loc167 = loc("q_start"(#loc3))
653
+ #loc168 = loc("off_zq"(#loc4))
654
+ #loc169 = loc("off_hq"(#loc5))
655
+ #loc170 = loc("off_hkv"(#loc6))
656
+ #loc171 = loc("q_offset"(#loc7))
657
+ #loc172 = loc("q_offset"(#loc8))
658
+ #loc173 = loc("q_offset"(#loc9))
659
+ #loc174 = loc("k_offset"(#loc10))
660
+ #loc175 = loc("Q"(#loc11))
661
+ #loc176 = loc("K"(#loc12))
662
+ #loc177 = loc("V"(#loc13))
663
+ #loc178 = loc("sparse_kv_idx_offset"(#loc14))
664
+ #loc179 = loc("offs_m"(#loc15))
665
+ #loc180 = loc("offs_m"(#loc16))
666
+ #loc181 = loc("offs_m"(#loc17))
667
+ #loc182 = loc("ptr"(#loc18))
668
+ #loc183 = loc("q"(#loc19))
669
+ #loc184 = loc("ptr"(#loc20))
670
+ #loc185 = loc("ptr"(#loc21))
671
+ #loc186 = loc("ptr"(#loc22))
672
+ #loc187 = loc("ptr"(#loc23))
673
+ #loc188 = loc("kv_indices"(#loc26))
674
+ #loc189 = loc("kv_start"(#loc27))
675
+ #loc190 = loc("kv_start"(#loc28))
676
+ #loc191 = loc("kv_num_blocks"(#loc29))
677
+ #loc192 = loc("kv_num_blocks"(#loc30))
678
+ #loc193 = loc("block_n_end"(#loc31))
679
+ #loc194 = loc("block_n_end"(#loc33))
680
+ #loc195 = loc("block_n_end"(#loc35))
681
+ #loc196 = loc("block_n_end"(#loc36))
682
+ #loc197 = loc("offs_n"(#loc37))
683
+ #loc198 = loc("offs_n"(#loc38))
684
+ #loc199 = loc("k"(#loc40))
685
+ #loc201 = loc("m"(#loc44))
686
+ #loc202 = loc("n"(#loc45))
687
+ #loc203 = loc("tmp3"(#loc46))
688
+ #loc204 = loc("tmp5"(#loc47))
689
+ #loc205 = loc("tmp6"(#loc48))
690
+ #loc206 = loc("tmp7"(#loc49))
691
+ #loc207 = loc("tmp9"(#loc50))
692
+ #loc208 = loc("tmp14"(#loc51))
693
+ #loc209 = loc("tmp14"(#loc52))
694
+ #loc210 = loc("tmp14"(#loc53))
695
+ #loc211 = loc("tmp14"(#loc54))
696
+ #loc212 = loc("tmp14"(#loc55))
697
+ #loc213 = loc("tmp14"(#loc56))
698
+ #loc214 = loc("tmp17"(#loc57))
699
+ #loc215 = loc("v"(#loc58))
700
+ #loc216 = loc("acc"(#loc59))
701
+ #loc217 = loc("offs_n_load"(#loc60))
702
+ #loc218 = loc("kv_base_offset"(#loc61))
703
+ #loc219 = loc("qk"(#loc62))
704
+ #loc220 = loc("k"(#loc63))
705
+ #loc221 = loc("qk"(#loc64))
706
+ #loc222 = loc("post_mod_scores"(#loc65))
707
+ #loc223 = loc("post_mod_scores"(#loc66))
708
+ #loc224 = loc("tmp8"(#loc67))
709
+ #loc225 = loc("tmp10"(#loc68))
710
+ #loc226 = loc("tmp11"(#loc69))
711
+ #loc227 = loc("tmp16"(#loc70))
712
+ #loc228 = loc("tmp16"(#loc71))
713
+ #loc229 = loc("tmp16"(#loc72))
714
+ #loc230 = loc("tmp16"(#loc73))
715
+ #loc231 = loc("tmp16"(#loc74))
716
+ #loc232 = loc("tmp16"(#loc75))
717
+ #loc233 = loc("tmp18"(#loc76))
718
+ #loc234 = loc("tmp19"(#loc77))
719
+ #loc235 = loc("tmp20"(#loc78))
720
+ #loc236 = loc("mask_mod_output"(#loc79))
721
+ #loc237 = loc("post_mod_scores"(#loc80))
722
+ #loc238 = loc("post_mod_scores"(#loc81))
723
+ #loc240 = loc("m_ij"(#loc85))
724
+ #loc241 = loc("masked_out_rows"(#loc86))
725
+ #loc242 = loc("m_ij_masked"(#loc87))
726
+ #loc243 = loc("alpha"(#loc88))
727
+ #loc244 = loc("alpha"(#loc89))
728
+ #loc245 = loc("p"(#loc90))
729
+ #loc246 = loc("p"(#loc91))
730
+ #loc247 = loc("p"(#loc92))
731
+ #loc248 = loc("l_i"(#loc93))
732
+ #loc250 = loc("l_i"(#loc97))
733
+ #loc251 = loc("acc"(#loc98))
734
+ #loc252 = loc("acc"(#loc99))
735
+ #loc253 = loc("acc"(#loc100))
736
+ #loc254 = loc("acc"(#loc101))
737
+ #loc255 = loc("offs_n"(#loc102))
738
+ #loc256 = loc("cur_block_idx"(#loc103))
739
+ #loc257 = loc("offset"(#loc104))
740
+ #loc258 = loc("cur_block"(#loc105))
741
+ #loc259 = loc("cur_block"(#loc106))
742
+ #loc260 = loc("next_block"(#loc107))
743
+ #loc261 = loc("next_block"(#loc108))
744
+ #loc262 = loc("next_block"(#loc109))
745
+ #loc263 = loc("next_block"(#loc110))
746
+ #loc264 = loc("needs_jump"(#loc111))
747
+ #loc265 = loc("needs_jump"(#loc112))
748
+ #loc266 = loc("needs_jump"(#loc113))
749
+ #loc267 = loc("jump_to_block"(#loc114))
750
+ #loc268 = loc("jump_to_block"(#loc115))
751
+ #loc269 = loc("jump_to_block"(#loc116))
752
+ #loc270 = loc("offset"(#loc117))
753
+ #loc271 = loc("offset"(#loc118))
754
+ #loc272 = loc("offset"(#loc119))
755
+ #loc273 = loc("offset"(#loc120))
756
+ #loc274 = loc("kv_offset"(#loc121))
757
+ #loc275 = loc("kv_indices"(#loc122))
758
+ #loc276 = loc("kv_start"(#loc123))
759
+ #loc277 = loc("kv_start"(#loc124))
760
+ #loc278 = loc("kv_num_blocks"(#loc125))
761
+ #loc279 = loc("kv_num_blocks"(#loc126))
762
+ #loc280 = loc("block_n_end"(#loc127))
763
+ #loc281 = loc("block_n_end"(#loc128))
764
+ #loc282 = loc("offs_n"(#loc129))
765
+ #loc284 = loc("l_i"(#loc132))
766
+ #loc285 = loc("l_i"(#loc133))
767
+ #loc286 = loc("acc"(#loc134))
768
+ #loc287 = loc("acc"(#loc135))
769
+ #loc288 = loc("mask"(#loc136))
770
+ #loc289 = loc("mask"(#loc137))
771
+ #loc290 = loc("off_hz"(#loc142))
772
+ #loc291 = loc("off_hz"(#loc143))
773
+ #loc292 = loc("l_ptrs"(#loc144))
774
+ #loc293 = loc("l_ptrs"(#loc145))
775
+ #loc294 = loc("l_ptrs"(#loc146))
776
+ #loc295 = loc("lse"(#loc147))
777
+ #loc296 = loc("lse"(#loc148))
778
+ #loc297 = loc(callsite(#loc182 at #loc183))
779
+ #loc298 = loc(callsite(#loc184 at #loc183))
780
+ #loc299 = loc(callsite(#loc185 at #loc183))
781
+ #loc300 = loc(callsite(#loc186 at #loc183))
782
+ #loc301 = loc(callsite(#loc187 at #loc183))
783
+ #loc302 = loc(callsite(#loc24 at #loc183))
784
+ #loc303 = loc(callsite(#loc25 at #loc183))
785
+ #loc304 = loc(callsite(#loc32 at #loc194))
786
+ #loc305 = loc(callsite(#loc34 at #loc194))
787
+ #loc306 = loc(callsite(#loc199 at #loc200))
788
+ #loc307 = loc(callsite(#loc201 at #loc200))
789
+ #loc308 = loc(callsite(#loc202 at #loc200))
790
+ #loc309 = loc(callsite(#loc203 at #loc200))
791
+ #loc310 = loc(callsite(#loc204 at #loc200))
792
+ #loc311 = loc(callsite(#loc205 at #loc200))
793
+ #loc312 = loc(callsite(#loc206 at #loc200))
794
+ #loc313 = loc(callsite(#loc207 at #loc200))
795
+ #loc314 = loc(callsite(#loc208 at #loc200))
796
+ #loc315 = loc(callsite(#loc209 at #loc200))
797
+ #loc316 = loc(callsite(#loc210 at #loc200))
798
+ #loc317 = loc(callsite(#loc211 at #loc200))
799
+ #loc318 = loc(callsite(#loc212 at #loc200))
800
+ #loc319 = loc(callsite(#loc213 at #loc200))
801
+ #loc320 = loc(callsite(#loc214 at #loc200))
802
+ #loc321 = loc(callsite(#loc215 at #loc200))
803
+ #loc322 = loc("l_i"(#loc216))
804
+ #loc323 = loc(callsite(#loc217 at #loc200))
805
+ #loc324 = loc(callsite(#loc218 at #loc200))
806
+ #loc325 = loc(callsite(#loc219 at #loc200))
807
+ #loc326 = loc(callsite(#loc220 at #loc200))
808
+ #loc327 = loc(callsite(#loc221 at #loc200))
809
+ #loc328 = loc(callsite(#loc222 at #loc200))
810
+ #loc329 = loc(callsite(#loc223 at #loc200))
811
+ #loc330 = loc(callsite(#loc224 at #loc200))
812
+ #loc331 = loc(callsite(#loc225 at #loc200))
813
+ #loc332 = loc(callsite(#loc226 at #loc200))
814
+ #loc333 = loc(callsite(#loc227 at #loc200))
815
+ #loc334 = loc(callsite(#loc228 at #loc200))
816
+ #loc335 = loc(callsite(#loc229 at #loc200))
817
+ #loc336 = loc(callsite(#loc230 at #loc200))
818
+ #loc337 = loc(callsite(#loc231 at #loc200))
819
+ #loc338 = loc(callsite(#loc232 at #loc200))
820
+ #loc339 = loc(callsite(#loc233 at #loc200))
821
+ #loc340 = loc(callsite(#loc234 at #loc200))
822
+ #loc341 = loc(callsite(#loc235 at #loc200))
823
+ #loc342 = loc(callsite(#loc236 at #loc200))
824
+ #loc343 = loc(callsite(#loc237 at #loc200))
825
+ #loc344 = loc(callsite(#loc238 at #loc200))
826
+ #loc346 = loc(callsite(#loc240 at #loc200))
827
+ #loc347 = loc(callsite(#loc241 at #loc200))
828
+ #loc348 = loc(callsite(#loc242 at #loc200))
829
+ #loc349 = loc(callsite(#loc243 at #loc200))
830
+ #loc350 = loc(callsite(#loc244 at #loc200))
831
+ #loc351 = loc(callsite(#loc245 at #loc200))
832
+ #loc352 = loc(callsite(#loc246 at #loc200))
833
+ #loc353 = loc(callsite(#loc247 at #loc200))
834
+ #loc354 = loc(callsite(#loc248 at #loc200))
835
+ #loc356 = loc(callsite(#loc250 at #loc200))
836
+ #loc357 = loc(callsite(#loc251 at #loc200))
837
+ #loc358 = loc(callsite(#loc252 at #loc200))
838
+ #loc359 = loc(callsite(#loc253 at #loc200))
839
+ #loc360 = loc(callsite(#loc254 at #loc200))
840
+ #loc361 = loc(callsite(#loc255 at #loc42))
841
+ #loc362 = loc(callsite(#loc257 at #loc42))
842
+ #loc363 = loc(callsite(#loc274 at #loc42))
843
+ #loc364 = loc(callsite(#loc199 at #loc283))
844
+ #loc365 = loc(callsite(#loc215 at #loc283))
845
+ #loc366 = loc(callsite(#loc217 at #loc283))
846
+ #loc367 = loc(callsite(#loc218 at #loc283))
847
+ #loc368 = loc(callsite(#loc219 at #loc283))
848
+ #loc369 = loc(callsite(#loc220 at #loc283))
849
+ #loc370 = loc(callsite(#loc221 at #loc283))
850
+ #loc371 = loc(callsite(#loc222 at #loc283))
851
+ #loc372 = loc(callsite(#loc223 at #loc283))
852
+ #loc373 = loc(callsite(#loc238 at #loc283))
853
+ #loc375 = loc(callsite(#loc240 at #loc283))
854
+ #loc376 = loc(callsite(#loc241 at #loc283))
855
+ #loc377 = loc(callsite(#loc242 at #loc283))
856
+ #loc378 = loc(callsite(#loc243 at #loc283))
857
+ #loc379 = loc(callsite(#loc244 at #loc283))
858
+ #loc380 = loc(callsite(#loc245 at #loc283))
859
+ #loc381 = loc(callsite(#loc246 at #loc283))
860
+ #loc382 = loc(callsite(#loc247 at #loc283))
861
+ #loc383 = loc(callsite(#loc248 at #loc283))
862
+ #loc385 = loc(callsite(#loc250 at #loc283))
863
+ #loc386 = loc(callsite(#loc251 at #loc283))
864
+ #loc387 = loc(callsite(#loc252 at #loc283))
865
+ #loc388 = loc(callsite(#loc253 at #loc283))
866
+ #loc389 = loc(callsite(#loc254 at #loc283))
867
+ #loc390 = loc(callsite(#loc255 at #loc131))
868
+ #loc391 = loc(callsite(#loc257 at #loc131))
869
+ #loc392 = loc(callsite(#loc274 at #loc131))
870
+ #loc393 = loc(callsite(#loc185 at #loc306))
871
+ #loc394 = loc(callsite(#loc187 at #loc306))
872
+ #loc395 = loc(callsite(#loc24 at #loc306))
873
+ #loc396 = loc(callsite(#loc43 at #loc307))
874
+ #loc397 = loc(callsite(#loc43 at #loc308))
875
+ #loc398 = loc(callsite(#loc185 at #loc321))
876
+ #loc399 = loc(callsite(#loc25 at #loc306))
877
+ #loc400 = loc(callsite(#loc25 at #loc321))
878
+ #loc401 = loc("m_i"(#loc322))
879
+ #loc402 = loc(callsite(#loc182 at #loc306))
880
+ #loc403 = loc(callsite(#loc184 at #loc306))
881
+ #loc404 = loc(callsite(#loc187 at #loc321))
882
+ #loc405 = loc(callsite(#loc82 at #loc345))
883
+ #loc407 = loc(callsite(#loc94 at #loc355))
884
+ #loc409 = loc(callsite(#loc256 at #loc362))
885
+ #loc410 = loc(callsite(#loc258 at #loc362))
886
+ #loc411 = loc(callsite(#loc259 at #loc362))
887
+ #loc412 = loc(callsite(#loc260 at #loc362))
888
+ #loc413 = loc(callsite(#loc261 at #loc362))
889
+ #loc414 = loc(callsite(#loc262 at #loc362))
890
+ #loc415 = loc(callsite(#loc263 at #loc362))
891
+ #loc416 = loc(callsite(#loc264 at #loc362))
892
+ #loc417 = loc(callsite(#loc265 at #loc362))
893
+ #loc418 = loc(callsite(#loc266 at #loc362))
894
+ #loc419 = loc(callsite(#loc267 at #loc362))
895
+ #loc420 = loc(callsite(#loc268 at #loc362))
896
+ #loc421 = loc(callsite(#loc269 at #loc362))
897
+ #loc422 = loc(callsite(#loc270 at #loc362))
898
+ #loc423 = loc(callsite(#loc271 at #loc362))
899
+ #loc424 = loc(callsite(#loc272 at #loc362))
900
+ #loc425 = loc(callsite(#loc273 at #loc362))
901
+ #loc426 = loc(callsite(#loc25 at #loc364))
902
+ #loc427 = loc(callsite(#loc25 at #loc365))
903
+ #loc428 = loc(callsite(#loc182 at #loc364))
904
+ #loc429 = loc(callsite(#loc184 at #loc364))
905
+ #loc430 = loc(callsite(#loc185 at #loc364))
906
+ #loc431 = loc(callsite(#loc187 at #loc364))
907
+ #loc432 = loc(callsite(#loc24 at #loc364))
908
+ #loc433 = loc(callsite(#loc185 at #loc365))
909
+ #loc434 = loc(callsite(#loc187 at #loc365))
910
+ #loc435 = loc(callsite(#loc82 at #loc374))
911
+ #loc437 = loc(callsite(#loc94 at #loc384))
912
+ #loc439 = loc(callsite(#loc256 at #loc391))
913
+ #loc440 = loc(callsite(#loc258 at #loc391))
914
+ #loc441 = loc(callsite(#loc259 at #loc391))
915
+ #loc442 = loc(callsite(#loc260 at #loc391))
916
+ #loc443 = loc(callsite(#loc261 at #loc391))
917
+ #loc444 = loc(callsite(#loc262 at #loc391))
918
+ #loc445 = loc(callsite(#loc263 at #loc391))
919
+ #loc446 = loc(callsite(#loc264 at #loc391))
920
+ #loc447 = loc(callsite(#loc265 at #loc391))
921
+ #loc448 = loc(callsite(#loc266 at #loc391))
922
+ #loc449 = loc(callsite(#loc267 at #loc391))
923
+ #loc450 = loc(callsite(#loc268 at #loc391))
924
+ #loc451 = loc(callsite(#loc269 at #loc391))
925
+ #loc452 = loc(callsite(#loc270 at #loc391))
926
+ #loc453 = loc(callsite(#loc271 at #loc391))
927
+ #loc454 = loc(callsite(#loc272 at #loc391))
928
+ #loc455 = loc(callsite(#loc273 at #loc391))
929
+ #loc456 = loc("offs_n"(#loc401))
930
+ #loc457 = loc(callsite(#loc84 at #loc405))
931
+ #loc458 = loc(callsite(#loc96 at #loc407))
932
+ #loc459 = loc(callsite(#loc84 at #loc435))
933
+ #loc460 = loc(callsite(#loc96 at #loc437))
934
+ #loc461 = loc("kv_offset"(#loc456))
935
+ #loc462 = loc(callsite(#loc461 at #loc42))
936
+ #loc463 = loc(callsite(#loc461 at #loc131))
progress/SpecForge/cache/compiled_kernels/triton/3/23MFPG6HLDX2565HGAMARD6NR52JWXZFYOVRFUBKYJAOEMI2YQEQ/triton_tem_fused_0.ttir ADDED
@@ -0,0 +1,780 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":18:0)
2
+ #loc1 = loc(unknown)
3
+ #loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":172:41)
4
+ #loc48 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":520:16)
5
+ #loc87 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":403:51)
6
+ #loc99 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":416:34)
7
+ #loc137 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":198:45)
8
+ #loc156 = loc("arg_Q"(#loc))
9
+ #loc157 = loc("arg_K"(#loc))
10
+ #loc158 = loc("arg_V"(#loc))
11
+ #loc159 = loc("arg_LSE"(#loc))
12
+ #loc160 = loc("arg_MAX"(#loc))
13
+ #loc161 = loc("arg_KV_NUM_BLKS"(#loc))
14
+ #loc162 = loc("arg_KV_IDX"(#loc))
15
+ #loc163 = loc("arg_FULL_KV_NUM_BLKS"(#loc))
16
+ #loc164 = loc("arg_FULL_KV_IDX"(#loc))
17
+ #loc165 = loc("out_ptr0"(#loc))
18
+ #loc166 = loc("ks0"(#loc))
19
+ #loc167 = loc("ks1"(#loc))
20
+ #loc168 = loc("ks2"(#loc))
21
+ #loc169 = loc("ks3"(#loc))
22
+ #loc170 = loc("ks4"(#loc))
23
+ #loc210 = loc(callsite(#loc48 at #loc2))
24
+ #loc247 = loc("m_ij"(#loc87))
25
+ #loc257 = loc("l_i"(#loc99))
26
+ #loc293 = loc(callsite(#loc48 at #loc137))
27
+ #loc354 = loc(callsite(#loc247 at #loc210))
28
+ #loc364 = loc(callsite(#loc257 at #loc210))
29
+ #loc383 = loc(callsite(#loc247 at #loc293))
30
+ #loc393 = loc(callsite(#loc257 at #loc293))
31
+ #loc413 = loc(callsite(#loc1 at #loc354))
32
+ #loc415 = loc(callsite(#loc1 at #loc364))
33
+ #loc443 = loc(callsite(#loc1 at #loc383))
34
+ #loc445 = loc(callsite(#loc1 at #loc393))
35
+ module {
36
+ tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_MAX: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("arg_MAX"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc)), %ks2: i32 loc("ks2"(#loc)), %ks3: i32 loc("ks3"(#loc)), %ks4: i32 loc("ks4"(#loc))) attributes {noinline = false} {
37
+ %cst = arith.constant dense<1024> : tensor<64x1xi32> loc(#loc1)
38
+ %c0_i32 = arith.constant 0 : i32 loc(#loc1)
39
+ %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x128xbf16> loc(#loc1)
40
+ %cst_1 = arith.constant dense<16> : tensor<1x64xi32> loc(#loc171)
41
+ %cst_2 = arith.constant dense<16> : tensor<128x1xi32> loc(#loc171)
42
+ %cst_3 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc1)
43
+ %cst_4 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc1)
44
+ %cst_5 = arith.constant dense<false> : tensor<128x64xi1> loc(#loc171)
45
+ %cst_6 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc171)
46
+ %cst_7 = arith.constant dense<1> : tensor<128x1xi32> loc(#loc171)
47
+ %cst_8 = arith.constant dense<0> : tensor<128x1xi32> loc(#loc171)
48
+ %cst_9 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc171)
49
+ %cst_10 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc1)
50
+ %cst_11 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc1)
51
+ %cst_12 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1)
52
+ %c63_i32 = arith.constant 63 : i32 loc(#loc1)
53
+ %c64_i32 = arith.constant 64 : i32 loc(#loc1)
54
+ %q = arith.constant dense<0.000000e+00> : tensor<128x128xbf16> loc(#loc306)
55
+ %acc = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc307)
56
+ %cst_13 = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc1)
57
+ %mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc174)
58
+ %l_i = arith.constant dense<1.000000e+00> : tensor<128xf32> loc(#loc175)
59
+ %cst_14 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc1)
60
+ %c2_i32 = arith.constant 2 : i32 loc(#loc1)
61
+ %c4_i32 = arith.constant 4 : i32 loc(#loc1)
62
+ %HQ = arith.constant 32 : i32 loc(#loc176)
63
+ %c1_i32 = arith.constant 1 : i32 loc(#loc1)
64
+ %c128_i32 = arith.constant 128 : i32 loc(#loc1)
65
+ %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
66
+ %0 = arith.muli %ks0, %c4096_i32 : i32 loc(#loc10)
67
+ %q_start = tt.get_program_id x : i32 loc(#loc177)
68
+ %off_zq = tt.get_program_id y : i32 loc(#loc178)
69
+ %off_hq = tt.get_program_id z : i32 loc(#loc179)
70
+ %off_hkv = arith.divsi %off_hq, %c4_i32 : i32 loc(#loc180)
71
+ %q_offset = arith.muli %off_zq, %0 : i32 loc(#loc181)
72
+ %q_offset_15 = arith.muli %off_hq, %c128_i32 : i32 loc(#loc182)
73
+ %q_offset_16 = arith.addi %q_offset, %q_offset_15 : i32 loc(#loc183)
74
+ %k_offset = arith.muli %off_hkv, %c128_i32 : i32 loc(#loc184)
75
+ %Q = tt.addptr %arg_Q, %q_offset_16 : !tt.ptr<bf16>, i32 loc(#loc185)
76
+ %K = tt.addptr %arg_K, %k_offset : !tt.ptr<bf16>, i32 loc(#loc186)
77
+ %V = tt.addptr %arg_V, %k_offset : !tt.ptr<bf16>, i32 loc(#loc187)
78
+ %sparse_kv_idx_offset = arith.muli %q_start, %ks4 : i32 loc(#loc188)
79
+ %offs_m = arith.muli %q_start, %c128_i32 : i32 loc(#loc189)
80
+ %offs_m_17 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc190)
81
+ %offs_m_18 = tt.splat %offs_m : i32 -> tensor<128xi32> loc(#loc191)
82
+ %offs_m_19 = arith.addi %offs_m_18, %offs_m_17 : tensor<128xi32> loc(#loc191)
83
+ %ptr = tt.expand_dims %offs_m_19 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc308)
84
+ %ptr_20 = arith.muli %ptr, %cst_13 : tensor<128x1xi32> loc(#loc309)
85
+ %ptr_21 = tt.splat %Q : !tt.ptr<bf16> -> tensor<128x1x!tt.ptr<bf16>> loc(#loc310)
86
+ %ptr_22 = tt.addptr %ptr_21, %ptr_20 : tensor<128x1x!tt.ptr<bf16>>, tensor<128x1xi32> loc(#loc310)
87
+ %ptr_23 = tt.expand_dims %offs_m_17 {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc311)
88
+ %ptr_24 = tt.broadcast %ptr_22 : tensor<128x1x!tt.ptr<bf16>> -> tensor<128x128x!tt.ptr<bf16>> loc(#loc312)
89
+ %ptr_25 = tt.broadcast %ptr_23 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc312)
90
+ %ptr_26 = tt.addptr %ptr_24, %ptr_25 : tensor<128x128x!tt.ptr<bf16>>, tensor<128x128xi32> loc(#loc312)
91
+ %q_27 = tt.splat %ks0 : i32 -> tensor<128x1xi32> loc(#loc313)
92
+ %q_28 = arith.cmpi slt, %ptr, %q_27 : tensor<128x1xi32> loc(#loc313)
93
+ %q_29 = tt.broadcast %q_28 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc306)
94
+ %q_30 = tt.load %ptr_26, %q_29, %q : tensor<128x128x!tt.ptr<bf16>> loc(#loc306)
95
+ %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset : !tt.ptr<i32>, i32 loc(#loc197)
96
+ %kv_start = tt.load %kv_indices : !tt.ptr<i32> loc(#loc198)
97
+ %kv_start_31 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc199)
98
+ %kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %q_start : !tt.ptr<i32>, i32 loc(#loc200)
99
+ %kv_num_blocks_32 = tt.load %kv_num_blocks : !tt.ptr<i32> loc(#loc201)
100
+ %block_n_end = arith.muli %kv_num_blocks_32, %c2_i32 : i32 loc(#loc202)
101
+ %block_n_end_33 = arith.addi %ks1, %c63_i32 : i32 loc(#loc314)
102
+ %block_n_end_34 = arith.divsi %block_n_end_33, %c64_i32 : i32 loc(#loc315)
103
+ %block_n_end_35 = arith.maxsi %block_n_end_34, %c1_i32 : i32 loc(#loc204)
104
+ %block_n_end_36 = arith.minsi %block_n_end, %block_n_end_35 : i32 loc(#loc205)
105
+ %offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc206)
106
+ %offs_n_37 = tt.splat %kv_start_31 : i32 -> tensor<64xi32> loc(#loc207)
107
+ %offs_n_38 = arith.addi %offs_n_37, %offs_n : tensor<64xi32> loc(#loc207)
108
+ %1 = tt.expand_dims %offs_n_38 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc45)
109
+ %kv_offset:5 = scf.for %start_n = %c0_i32 to %block_n_end_36 step %c1_i32 iter_args(%acc_62 = %acc, %l_i_63 = %cst_14, %m_i = %cst_3, %offs_n_64 = %1, %kv_offset_65 = %c0_i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 {
110
+ %kv_base_offset = arith.addi %kv_start_31, %kv_offset_65 : i32 loc(#loc317)
111
+ %offs_n_load = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc318)
112
+ %offs_n_load_66 = arith.addi %offs_n_load, %offs_n : tensor<64xi32> loc(#loc318)
113
+ %ptr_67 = tt.expand_dims %offs_n_load_66 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc404)
114
+ %ptr_68 = arith.muli %ptr_67, %cst : tensor<64x1xi32> loc(#loc405)
115
+ %ptr_69 = tt.splat %K : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc406)
116
+ %ptr_70 = tt.addptr %ptr_69, %ptr_68 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc406)
117
+ %ptr_71 = tt.broadcast %ptr_70 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc407)
118
+ %ptr_72 = tt.broadcast %ptr_23 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc407)
119
+ %ptr_73 = tt.addptr %ptr_71, %ptr_72 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc407)
120
+ %k = tt.splat %ks1 : i32 -> tensor<64x1xi32> loc(#loc408)
121
+ %k_74 = arith.cmpi slt, %ptr_67, %k : tensor<64x1xi32> loc(#loc408)
122
+ %k_75 = tt.broadcast %k_74 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc409)
123
+ %k_76 = tt.load %ptr_73, %k_75, %cst_0 : tensor<64x128x!tt.ptr<bf16>> loc(#loc409)
124
+ %k_77 = tt.trans %k_76 {order = array<i32: 1, 0>} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc320)
125
+ %qk = tt.dot %q_30, %k_77, %cst_12, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc321)
126
+ %qk_78 = arith.mulf %qk, %cst_11 : tensor<128x64xf32> loc(#loc322)
127
+ %m = arith.remsi %ptr, %q_27 : tensor<128x1xi32> loc(#loc410)
128
+ %n = tt.splat %ks1 : i32 -> tensor<1x64xi32> loc(#loc411)
129
+ %n_79 = arith.remsi %offs_n_64, %n : tensor<1x64xi32> loc(#loc411)
130
+ %post_mod_scores = arith.cmpi slt, %offs_n_64, %n : tensor<1x64xi32> loc(#loc325)
131
+ %post_mod_scores_80 = tt.broadcast %post_mod_scores : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc326)
132
+ %post_mod_scores_81 = arith.select %post_mod_scores_80, %qk_78, %cst_10 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc326)
133
+ %tmp3 = arith.cmpi slt, %m, %cst_8 : tensor<128x1xi32> loc(#loc327)
134
+ %tmp5 = tt.broadcast %n_79 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc328)
135
+ %tmp5_82 = tt.broadcast %m : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc328)
136
+ %tmp5_83 = arith.cmpi sle, %tmp5, %tmp5_82 : tensor<128x64xi32> loc(#loc328)
137
+ %tmp6 = tt.broadcast %tmp3 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc329)
138
+ %tmp6_84 = arith.andi %tmp6, %tmp5_83 : tensor<128x64xi1> loc(#loc329)
139
+ %tmp7 = arith.cmpi sge, %m, %cst_8 : tensor<128x1xi32> loc(#loc330)
140
+ %tmp8 = arith.cmpi slt, %n_79, %cst_9 : tensor<1x64xi32> loc(#loc331)
141
+ %tmp9 = tt.broadcast %tmp7 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc332)
142
+ %tmp9_85 = tt.broadcast %tmp8 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc332)
143
+ %tmp9_86 = arith.andi %tmp9, %tmp9_85 : tensor<128x64xi1> loc(#loc332)
144
+ %tmp10 = arith.extui %tmp8 : tensor<1x64xi1> to tensor<1x64xi32> loc(#loc333)
145
+ %tmp10_87 = arith.cmpi eq, %tmp10, %cst_9 : tensor<1x64xi32> loc(#loc333)
146
+ %tmp11 = tt.broadcast %tmp10_87 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc334)
147
+ %tmp11_88 = arith.andi %tmp9, %tmp11 : tensor<128x64xi1> loc(#loc334)
148
+ %tmp14 = arith.remsi %m, %cst_2 : tensor<128x1xi32> loc(#loc335)
149
+ %tmp14_89 = arith.cmpi ne, %tmp14, %cst_8 : tensor<128x1xi32> loc(#loc336)
150
+ %tmp14_90 = arith.divsi %m, %cst_2 : tensor<128x1xi32> loc(#loc337)
151
+ %tmp14_91 = arith.subi %tmp14_90, %cst_7 : tensor<128x1xi32> loc(#loc338)
152
+ %tmp14_92 = arith.select %tmp14_89, %tmp14_91, %tmp14_90 : tensor<128x1xi1>, tensor<128x1xi32> loc(#loc339)
153
+ %tmp14_93 = arith.select %tmp3, %tmp14_92, %tmp14_90 : tensor<128x1xi1>, tensor<128x1xi32> loc(#loc340)
154
+ %tmp16 = arith.remsi %n_79, %cst_1 : tensor<1x64xi32> loc(#loc341)
155
+ %tmp16_94 = arith.cmpi ne, %tmp16, %cst_9 : tensor<1x64xi32> loc(#loc342)
156
+ %tmp16_95 = arith.divsi %n_79, %cst_1 : tensor<1x64xi32> loc(#loc343)
157
+ %tmp16_96 = arith.subi %tmp16_95, %cst_6 : tensor<1x64xi32> loc(#loc344)
158
+ %tmp16_97 = arith.select %tmp16_94, %tmp16_96, %tmp16_95 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc345)
159
+ %tmp16_98 = arith.select %tmp8, %tmp16_97, %tmp16_95 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc346)
160
+ %tmp17 = tt.broadcast %tmp14_93 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc347)
161
+ %tmp17_99 = tt.broadcast %tmp16_98 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc347)
162
+ %tmp17_100 = arith.cmpi eq, %tmp17, %tmp17_99 : tensor<128x64xi32> loc(#loc347)
163
+ %tmp18 = arith.andi %tmp11_88, %tmp17_100 : tensor<128x64xi1> loc(#loc348)
164
+ %tmp19 = arith.ori %tmp9_86, %tmp18 : tensor<128x64xi1> loc(#loc349)
165
+ %tmp20 = arith.ori %tmp6_84, %tmp19 : tensor<128x64xi1> loc(#loc350)
166
+ %mask_mod_output = arith.select %post_mod_scores_80, %tmp20, %cst_5 : tensor<128x64xi1>, tensor<128x64xi1> loc(#loc351)
167
+ %post_mod_scores_101 = arith.select %mask_mod_output, %post_mod_scores_81, %cst_10 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc352)
168
+ %post_mod_scores_102 = arith.mulf %post_mod_scores_101, %cst_4 : tensor<128x64xf32> loc(#loc353)
169
+ %m_ij = "tt.reduce"(%post_mod_scores_102) <{axis = 1 : i32}> ({
170
+ ^bb0(%m_ij_135: f32 loc(callsite(#loc1 at #loc354)), %m_ij_136: f32 loc(callsite(#loc1 at #loc354))):
171
+ %m_ij_137 = arith.maxnumf %m_ij_135, %m_ij_136 : f32 loc(#loc467)
172
+ tt.reduce.return %m_ij_137 : f32 loc(#loc412)
173
+ }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc412)
174
+ %m_ij_103 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc355)
175
+ %masked_out_rows = arith.cmpf oeq, %m_ij_103, %cst_3 : tensor<128xf32> loc(#loc356)
176
+ %m_ij_masked = arith.select %masked_out_rows, %cst_14, %m_ij_103 : tensor<128xi1>, tensor<128xf32> loc(#loc357)
177
+ %alpha = arith.subf %m_i, %m_ij_masked : tensor<128xf32> loc(#loc358)
178
+ %alpha_104 = math.exp2 %alpha : tensor<128xf32> loc(#loc359)
179
+ %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc360)
180
+ %p_105 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc361)
181
+ %p_106 = arith.subf %post_mod_scores_102, %p_105 : tensor<128x64xf32> loc(#loc361)
182
+ %p_107 = math.exp2 %p_106 : tensor<128x64xf32> loc(#loc362)
183
+ %l_i_108 = arith.mulf %l_i_63, %alpha_104 : tensor<128xf32> loc(#loc363)
184
+ %l_i_109 = "tt.reduce"(%p_107) <{axis = 1 : i32}> ({
185
+ ^bb0(%l_i_135: f32 loc(callsite(#loc1 at #loc364)), %l_i_136: f32 loc(callsite(#loc1 at #loc364))):
186
+ %l_i_137 = arith.addf %l_i_135, %l_i_136 : f32 loc(#loc468)
187
+ tt.reduce.return %l_i_137 : f32 loc(#loc414)
188
+ }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc414)
189
+ %l_i_110 = arith.addf %l_i_108, %l_i_109 : tensor<128xf32> loc(#loc365)
190
+ %acc_111 = tt.expand_dims %alpha_104 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc366)
191
+ %acc_112 = tt.broadcast %acc_111 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc367)
192
+ %acc_113 = arith.mulf %acc_62, %acc_112 : tensor<128x128xf32> loc(#loc367)
193
+ %ptr_114 = tt.splat %V : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc416)
194
+ %ptr_115 = tt.addptr %ptr_114, %ptr_68 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc416)
195
+ %ptr_116 = tt.broadcast %ptr_115 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc417)
196
+ %ptr_117 = tt.addptr %ptr_116, %ptr_72 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc417)
197
+ %v = tt.load %ptr_117, %k_75, %cst_0 : tensor<64x128x!tt.ptr<bf16>> loc(#loc418)
198
+ %acc_118 = arith.truncf %p_107 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc369)
199
+ %acc_119 = tt.dot %acc_118, %v, %acc_113, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc370)
200
+ %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc419)
201
+ %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr<i32>, i32 loc(#loc420)
202
+ %cur_block_120 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc421)
203
+ %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc422)
204
+ %next_block_121 = arith.cmpi slt, %next_block, %kv_num_blocks_32 : i32 loc(#loc423)
205
+ %next_block_122 = tt.addptr %cur_block, %c1_i32 : !tt.ptr<i32>, i32 loc(#loc424)
206
+ %next_block_123 = tt.load %next_block_122, %next_block_121 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc425)
207
+ %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc426)
208
+ %needs_jump_124 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc427)
209
+ %needs_jump_125 = arith.cmpi eq, %needs_jump_124, %c0_i32 : i32 loc(#loc428)
210
+ %jump_to_block = arith.subi %next_block_123, %cur_block_120 : i32 loc(#loc429)
211
+ %jump_to_block_126 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc430)
212
+ %jump_to_block_127 = arith.subi %jump_to_block_126, %c64_i32 : i32 loc(#loc431)
213
+ %offset = arith.extui %needs_jump_125 : i1 to i32 loc(#loc432)
214
+ %offset_128 = arith.muli %jump_to_block_127, %offset : i32 loc(#loc432)
215
+ %offset_129 = arith.subi %c1_i32, %offset : i32 loc(#loc433)
216
+ %offset_130 = arith.muli %offset_129, %c64_i32 : i32 loc(#loc434)
217
+ %offset_131 = arith.addi %offset_128, %offset_130 : i32 loc(#loc435)
218
+ %offs_n_132 = tt.splat %offset_131 : i32 -> tensor<1x64xi32> loc(#loc372)
219
+ %offs_n_133 = arith.addi %offs_n_64, %offs_n_132 : tensor<1x64xi32> loc(#loc372)
220
+ %kv_offset_134 = arith.addi %kv_offset_65, %offset_131 : i32 loc(#loc373)
221
+ scf.yield %acc_119, %l_i_110, %m_ij_103, %offs_n_133, %kv_offset_134 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc284)
222
+ } loc(#loc472)
223
+ %kv_indices_39 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset : !tt.ptr<i32>, i32 loc(#loc285)
224
+ %kv_start_40 = tt.load %kv_indices_39 : !tt.ptr<i32> loc(#loc286)
225
+ %kv_start_41 = arith.muli %kv_start_40, %c128_i32 : i32 loc(#loc287)
226
+ %kv_num_blocks_42 = tt.addptr %arg_FULL_KV_NUM_BLKS, %q_start : !tt.ptr<i32>, i32 loc(#loc288)
227
+ %kv_num_blocks_43 = tt.load %kv_num_blocks_42 : !tt.ptr<i32> loc(#loc289)
228
+ %block_n_end_44 = arith.muli %kv_num_blocks_43, %c2_i32 : i32 loc(#loc290)
229
+ %block_n_end_45 = arith.minsi %block_n_end_44, %block_n_end_35 : i32 loc(#loc291)
230
+ %offs_n_46 = tt.splat %kv_start_41 : i32 -> tensor<64xi32> loc(#loc292)
231
+ %offs_n_47 = arith.addi %offs_n_46, %offs_n : tensor<64xi32> loc(#loc292)
232
+ %2 = tt.expand_dims %offs_n_47 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc136)
233
+ %kv_offset_48:5 = scf.for %start_n = %c0_i32 to %block_n_end_45 step %c1_i32 iter_args(%acc_62 = %kv_offset#0, %l_i_63 = %kv_offset#1, %m_i = %kv_offset#2, %offs_n_64 = %2, %kv_offset_65 = %c0_i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 {
234
+ %kv_base_offset = arith.addi %kv_start_41, %kv_offset_65 : i32 loc(#loc374)
235
+ %offs_n_load = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc375)
236
+ %offs_n_load_66 = arith.addi %offs_n_load, %offs_n : tensor<64xi32> loc(#loc375)
237
+ %ptr_67 = tt.expand_dims %offs_n_load_66 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc436)
238
+ %ptr_68 = arith.muli %ptr_67, %cst : tensor<64x1xi32> loc(#loc437)
239
+ %ptr_69 = tt.splat %K : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc438)
240
+ %ptr_70 = tt.addptr %ptr_69, %ptr_68 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc438)
241
+ %ptr_71 = tt.broadcast %ptr_70 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc439)
242
+ %ptr_72 = tt.broadcast %ptr_23 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc439)
243
+ %ptr_73 = tt.addptr %ptr_71, %ptr_72 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc439)
244
+ %k = tt.splat %ks1 : i32 -> tensor<64x1xi32> loc(#loc440)
245
+ %k_74 = arith.cmpi slt, %ptr_67, %k : tensor<64x1xi32> loc(#loc440)
246
+ %k_75 = tt.broadcast %k_74 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc441)
247
+ %k_76 = tt.load %ptr_73, %k_75, %cst_0 : tensor<64x128x!tt.ptr<bf16>> loc(#loc441)
248
+ %k_77 = tt.trans %k_76 {order = array<i32: 1, 0>} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc377)
249
+ %qk = tt.dot %q_30, %k_77, %cst_12, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc378)
250
+ %qk_78 = arith.mulf %qk, %cst_11 : tensor<128x64xf32> loc(#loc379)
251
+ %post_mod_scores = tt.splat %ks1 : i32 -> tensor<1x64xi32> loc(#loc380)
252
+ %post_mod_scores_79 = arith.cmpi slt, %offs_n_64, %post_mod_scores : tensor<1x64xi32> loc(#loc380)
253
+ %post_mod_scores_80 = tt.broadcast %post_mod_scores_79 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc381)
254
+ %post_mod_scores_81 = arith.select %post_mod_scores_80, %qk_78, %cst_10 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc381)
255
+ %post_mod_scores_82 = arith.mulf %post_mod_scores_81, %cst_4 : tensor<128x64xf32> loc(#loc382)
256
+ %m_ij = "tt.reduce"(%post_mod_scores_82) <{axis = 1 : i32}> ({
257
+ ^bb0(%m_ij_115: f32 loc(callsite(#loc1 at #loc383)), %m_ij_116: f32 loc(callsite(#loc1 at #loc383))):
258
+ %m_ij_117 = arith.maxnumf %m_ij_115, %m_ij_116 : f32 loc(#loc469)
259
+ tt.reduce.return %m_ij_117 : f32 loc(#loc442)
260
+ }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc442)
261
+ %m_ij_83 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc384)
262
+ %masked_out_rows = arith.cmpf oeq, %m_ij_83, %cst_3 : tensor<128xf32> loc(#loc385)
263
+ %m_ij_masked = arith.select %masked_out_rows, %cst_14, %m_ij_83 : tensor<128xi1>, tensor<128xf32> loc(#loc386)
264
+ %alpha = arith.subf %m_i, %m_ij_masked : tensor<128xf32> loc(#loc387)
265
+ %alpha_84 = math.exp2 %alpha : tensor<128xf32> loc(#loc388)
266
+ %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc389)
267
+ %p_85 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc390)
268
+ %p_86 = arith.subf %post_mod_scores_82, %p_85 : tensor<128x64xf32> loc(#loc390)
269
+ %p_87 = math.exp2 %p_86 : tensor<128x64xf32> loc(#loc391)
270
+ %l_i_88 = arith.mulf %l_i_63, %alpha_84 : tensor<128xf32> loc(#loc392)
271
+ %l_i_89 = "tt.reduce"(%p_87) <{axis = 1 : i32}> ({
272
+ ^bb0(%l_i_115: f32 loc(callsite(#loc1 at #loc393)), %l_i_116: f32 loc(callsite(#loc1 at #loc393))):
273
+ %l_i_117 = arith.addf %l_i_115, %l_i_116 : f32 loc(#loc470)
274
+ tt.reduce.return %l_i_117 : f32 loc(#loc444)
275
+ }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc444)
276
+ %l_i_90 = arith.addf %l_i_88, %l_i_89 : tensor<128xf32> loc(#loc394)
277
+ %acc_91 = tt.expand_dims %alpha_84 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc395)
278
+ %acc_92 = tt.broadcast %acc_91 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc396)
279
+ %acc_93 = arith.mulf %acc_62, %acc_92 : tensor<128x128xf32> loc(#loc396)
280
+ %ptr_94 = tt.splat %V : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc446)
281
+ %ptr_95 = tt.addptr %ptr_94, %ptr_68 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc446)
282
+ %ptr_96 = tt.broadcast %ptr_95 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc447)
283
+ %ptr_97 = tt.addptr %ptr_96, %ptr_72 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc447)
284
+ %v = tt.load %ptr_97, %k_75, %cst_0 : tensor<64x128x!tt.ptr<bf16>> loc(#loc448)
285
+ %acc_98 = arith.truncf %p_87 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc398)
286
+ %acc_99 = tt.dot %acc_98, %v, %acc_93, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc399)
287
+ %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc449)
288
+ %cur_block = tt.addptr %kv_indices_39, %cur_block_idx : !tt.ptr<i32>, i32 loc(#loc450)
289
+ %cur_block_100 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc451)
290
+ %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc452)
291
+ %next_block_101 = arith.cmpi slt, %next_block, %kv_num_blocks_43 : i32 loc(#loc453)
292
+ %next_block_102 = tt.addptr %cur_block, %c1_i32 : !tt.ptr<i32>, i32 loc(#loc454)
293
+ %next_block_103 = tt.load %next_block_102, %next_block_101 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc455)
294
+ %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc456)
295
+ %needs_jump_104 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc457)
296
+ %needs_jump_105 = arith.cmpi eq, %needs_jump_104, %c0_i32 : i32 loc(#loc458)
297
+ %jump_to_block = arith.subi %next_block_103, %cur_block_100 : i32 loc(#loc459)
298
+ %jump_to_block_106 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc460)
299
+ %jump_to_block_107 = arith.subi %jump_to_block_106, %c64_i32 : i32 loc(#loc461)
300
+ %offset = arith.extui %needs_jump_105 : i1 to i32 loc(#loc462)
301
+ %offset_108 = arith.muli %jump_to_block_107, %offset : i32 loc(#loc462)
302
+ %offset_109 = arith.subi %c1_i32, %offset : i32 loc(#loc463)
303
+ %offset_110 = arith.muli %offset_109, %c64_i32 : i32 loc(#loc464)
304
+ %offset_111 = arith.addi %offset_108, %offset_110 : i32 loc(#loc465)
305
+ %offs_n_112 = tt.splat %offset_111 : i32 -> tensor<1x64xi32> loc(#loc401)
306
+ %offs_n_113 = arith.addi %offs_n_64, %offs_n_112 : tensor<1x64xi32> loc(#loc401)
307
+ %kv_offset_114 = arith.addi %kv_offset_65, %offset_111 : i32 loc(#loc402)
308
+ scf.yield %acc_99, %l_i_90, %m_ij_83, %offs_n_113, %kv_offset_114 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc294)
309
+ } loc(#loc473)
310
+ %l_i_49 = arith.cmpf oeq, %kv_offset_48#1, %cst_14 : tensor<128xf32> loc(#loc295)
311
+ %l_i_50 = arith.select %l_i_49, %l_i, %kv_offset_48#1 : tensor<128xi1>, tensor<128xf32> loc(#loc175)
312
+ %acc_51 = tt.expand_dims %l_i_50 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc296)
313
+ %acc_52 = tt.broadcast %acc_51 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc297)
314
+ %acc_53 = arith.divf %kv_offset_48#0, %acc_52 : tensor<128x128xf32> loc(#loc297)
315
+ %mask_54 = arith.cmpi slt, %ptr_23, %mask : tensor<1x128xi32> loc(#loc174)
316
+ %mask_55 = tt.broadcast %mask_54 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc298)
317
+ %mask_56 = arith.andi %q_29, %mask_55 : tensor<128x128xi1> loc(#loc298)
318
+ %3 = tt.splat %q_offset_15 : i32 -> tensor<1x128xi32> loc(#loc142)
319
+ %4 = arith.addi %ptr_23, %3 : tensor<1x128xi32> loc(#loc142)
320
+ %5 = tt.broadcast %4 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc143)
321
+ %6 = tt.broadcast %ptr_20 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc143)
322
+ %7 = arith.addi %5, %6 : tensor<128x128xi32> loc(#loc143)
323
+ %8 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<128x128x!tt.ptr<bf16>> loc(#loc144)
324
+ %9 = tt.addptr %8, %7 : tensor<128x128x!tt.ptr<bf16>>, tensor<128x128xi32> loc(#loc144)
325
+ %10 = arith.truncf %acc_53 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc145)
326
+ tt.store %9, %10, %mask_56 : tensor<128x128x!tt.ptr<bf16>> loc(#loc145)
327
+ %off_hz = arith.muli %off_zq, %HQ : i32 loc(#loc299)
328
+ %off_hz_57 = arith.addi %off_hz, %off_hq : i32 loc(#loc300)
329
+ %l_ptrs = arith.muli %off_hz_57, %ks0 : i32 loc(#loc301)
330
+ %l_ptrs_58 = tt.addptr %arg_LSE, %l_ptrs : !tt.ptr<f32>, i32 loc(#loc302)
331
+ %l_ptrs_59 = tt.splat %l_ptrs_58 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>> loc(#loc303)
332
+ %l_ptrs_60 = tt.addptr %l_ptrs_59, %offs_m_19 : tensor<128x!tt.ptr<f32>>, tensor<128xi32> loc(#loc303)
333
+ %lse = math.log2 %l_i_50 : tensor<128xf32> loc(#loc304)
334
+ %lse_61 = arith.addf %kv_offset_48#2, %lse : tensor<128xf32> loc(#loc305)
335
+ %11 = tt.splat %ks0 : i32 -> tensor<128xi32> loc(#loc153)
336
+ %12 = arith.cmpi slt, %offs_m_19, %11 : tensor<128xi32> loc(#loc153)
337
+ tt.store %l_ptrs_60, %lse_61, %12 : tensor<128x!tt.ptr<f32>> loc(#loc154)
338
+ tt.return loc(#loc155)
339
+ } loc(#loc)
340
+ } loc(#loc)
341
+ #loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":292:23)
342
+ #loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":146:101)
343
+ #loc5 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31)
344
+ #loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":136:19)
345
+ #loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":214:38)
346
+ #loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":206:34)
347
+ #loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":90:9)
348
+ #loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":85:54)
349
+ #loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":97:28)
350
+ #loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":98:27)
351
+ #loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":99:27)
352
+ #loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":104:24)
353
+ #loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":107:24)
354
+ #loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":107:45)
355
+ #loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":107:36)
356
+ #loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":108:47)
357
+ #loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":111:12)
358
+ #loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":112:12)
359
+ #loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":113:12)
360
+ #loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":143:97)
361
+ #loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":144:23)
362
+ #loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":144:46)
363
+ #loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":144:33)
364
+ #loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":284:27)
365
+ #loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":284:38)
366
+ #loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":284:20)
367
+ #loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":284:56)
368
+ #loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":284:49)
369
+ #loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":292:52)
370
+ #loc32 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":151:26)
371
+ #loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":152:23)
372
+ #loc34 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":152:37)
373
+ #loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":153:42)
374
+ #loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":153:28)
375
+ #loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":154:45)
376
+ #loc38 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22)
377
+ #loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":154:92)
378
+ #loc40 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28)
379
+ #loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":154:102)
380
+ #loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":154:65)
381
+ #loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":159:37)
382
+ #loc44 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":159:24)
383
+ #loc45 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":167:48)
384
+ #loc46 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":484:40)
385
+ #loc47 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":342:32)
386
+ #loc49 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":346:35)
387
+ #loc50 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":347:107)
388
+ #loc51 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":349:17)
389
+ #loc52 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":351:19)
390
+ #loc53 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":353:14)
391
+ #loc54 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":257:21)
392
+ #loc55 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":358:36)
393
+ #loc56 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":359:36)
394
+ #loc57 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":367:44)
395
+ #loc58 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":367:69)
396
+ #loc59 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":372:22)
397
+ #loc60 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":374:23)
398
+ #loc61 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":375:22)
399
+ #loc62 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":376:23)
400
+ #loc63 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":377:22)
401
+ #loc64 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":378:22)
402
+ #loc65 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":379:24)
403
+ #loc66 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":380:23)
404
+ #loc67 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":383:70)
405
+ #loc68 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":383:79)
406
+ #loc69 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":383:91)
407
+ #loc70 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":383:99)
408
+ #loc71 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":383:102)
409
+ #loc72 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":383:119)
410
+ #loc73 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":385:70)
411
+ #loc74 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":385:79)
412
+ #loc75 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":385:91)
413
+ #loc76 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":385:99)
414
+ #loc77 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":385:102)
415
+ #loc78 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":385:119)
416
+ #loc79 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":386:25)
417
+ #loc80 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":387:24)
418
+ #loc81 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":388:23)
419
+ #loc82 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":389:23)
420
+ #loc83 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":394:73)
421
+ #loc84 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":396:69)
422
+ #loc85 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":399:27)
423
+ #loc86 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40)
424
+ #loc88 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27)
425
+ #loc89 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":403:27)
426
+ #loc90 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":405:35)
427
+ #loc91 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":406:51)
428
+ #loc92 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":410:31)
429
+ #loc93 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":410:25)
430
+ #loc94 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":411:51)
431
+ #loc95 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":411:39)
432
+ #loc96 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":411:21)
433
+ #loc97 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":416:16)
434
+ #loc98 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
435
+ #loc100 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
436
+ #loc101 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":416:24)
437
+ #loc102 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":418:22)
438
+ #loc103 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":418:16)
439
+ #loc104 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":421:107)
440
+ #loc105 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":422:22)
441
+ #loc106 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":422:44)
442
+ #loc107 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":247:33)
443
+ #loc108 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":527:63)
444
+ #loc109 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":248:38)
445
+ #loc110 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":248:24)
446
+ #loc111 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":249:109)
447
+ #loc112 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":249:113)
448
+ #loc113 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":249:55)
449
+ #loc114 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":249:25)
450
+ #loc115 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":250:30)
451
+ #loc116 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":250:35)
452
+ #loc117 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":250:60)
453
+ #loc118 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":251:34)
454
+ #loc119 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":251:48)
455
+ #loc120 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":251:63)
456
+ #loc121 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":252:29)
457
+ #loc122 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":252:47)
458
+ #loc123 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":252:61)
459
+ #loc124 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":252:42)
460
+ #loc125 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":530:26)
461
+ #loc126 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":531:21)
462
+ #loc127 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":531:8)
463
+ #loc128 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":181:35)
464
+ #loc129 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":182:27)
465
+ #loc130 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":182:41)
466
+ #loc131 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":183:51)
467
+ #loc132 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":183:32)
468
+ #loc133 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":184:49)
469
+ #loc134 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":184:69)
470
+ #loc135 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":186:28)
471
+ #loc136 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":193:52)
472
+ #loc138 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":206:26)
473
+ #loc139 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":208:20)
474
+ #loc140 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":208:16)
475
+ #loc141 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":214:30)
476
+ #loc142 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":218:49)
477
+ #loc143 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":218:62)
478
+ #loc144 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":218:25)
479
+ #loc145 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":218:92)
480
+ #loc146 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":221:26)
481
+ #loc147 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":221:31)
482
+ #loc148 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":222:32)
483
+ #loc149 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":222:23)
484
+ #loc150 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":222:40)
485
+ #loc151 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":223:33)
486
+ #loc152 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":223:20)
487
+ #loc153 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":227:48)
488
+ #loc154 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":227:29)
489
+ #loc155 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sj/csjgoierbyb7y37xze3raek6tsz2nam2dgfupz3aqsd5fa3pyzto.py":229:4)
490
+ #loc171 = loc(callsite(#loc1 at #loc2))
491
+ #loc172 = loc("q"(#loc4))
492
+ #loc173 = loc("acc"(#loc6))
493
+ #loc174 = loc("mask"(#loc7))
494
+ #loc175 = loc("l_i"(#loc8))
495
+ #loc176 = loc("HQ"(#loc9))
496
+ #loc177 = loc("q_start"(#loc11))
497
+ #loc178 = loc("off_zq"(#loc12))
498
+ #loc179 = loc("off_hq"(#loc13))
499
+ #loc180 = loc("off_hkv"(#loc14))
500
+ #loc181 = loc("q_offset"(#loc15))
501
+ #loc182 = loc("q_offset"(#loc16))
502
+ #loc183 = loc("q_offset"(#loc17))
503
+ #loc184 = loc("k_offset"(#loc18))
504
+ #loc185 = loc("Q"(#loc19))
505
+ #loc186 = loc("K"(#loc20))
506
+ #loc187 = loc("V"(#loc21))
507
+ #loc188 = loc("sparse_kv_idx_offset"(#loc22))
508
+ #loc189 = loc("offs_m"(#loc23))
509
+ #loc190 = loc("offs_m"(#loc24))
510
+ #loc191 = loc("offs_m"(#loc25))
511
+ #loc192 = loc("ptr"(#loc26))
512
+ #loc193 = loc("ptr"(#loc27))
513
+ #loc194 = loc("ptr"(#loc28))
514
+ #loc195 = loc("ptr"(#loc29))
515
+ #loc196 = loc("ptr"(#loc30))
516
+ #loc197 = loc("kv_indices"(#loc32))
517
+ #loc198 = loc("kv_start"(#loc33))
518
+ #loc199 = loc("kv_start"(#loc34))
519
+ #loc200 = loc("kv_num_blocks"(#loc35))
520
+ #loc201 = loc("kv_num_blocks"(#loc36))
521
+ #loc202 = loc("block_n_end"(#loc37))
522
+ #loc203 = loc("block_n_end"(#loc39))
523
+ #loc204 = loc("block_n_end"(#loc41))
524
+ #loc205 = loc("block_n_end"(#loc42))
525
+ #loc206 = loc("offs_n"(#loc43))
526
+ #loc207 = loc("offs_n"(#loc44))
527
+ #loc208 = loc("acc"(#loc46))
528
+ #loc209 = loc("kv_base_offset"(#loc47))
529
+ #loc211 = loc("offs_n_load"(#loc49))
530
+ #loc212 = loc("k"(#loc50))
531
+ #loc213 = loc("k"(#loc51))
532
+ #loc214 = loc("qk"(#loc52))
533
+ #loc215 = loc("qk"(#loc53))
534
+ #loc216 = loc("m"(#loc55))
535
+ #loc217 = loc("n"(#loc56))
536
+ #loc218 = loc("post_mod_scores"(#loc57))
537
+ #loc219 = loc("post_mod_scores"(#loc58))
538
+ #loc220 = loc("tmp3"(#loc59))
539
+ #loc221 = loc("tmp5"(#loc60))
540
+ #loc222 = loc("tmp6"(#loc61))
541
+ #loc223 = loc("tmp7"(#loc62))
542
+ #loc224 = loc("tmp8"(#loc63))
543
+ #loc225 = loc("tmp9"(#loc64))
544
+ #loc226 = loc("tmp10"(#loc65))
545
+ #loc227 = loc("tmp11"(#loc66))
546
+ #loc228 = loc("tmp14"(#loc67))
547
+ #loc229 = loc("tmp14"(#loc68))
548
+ #loc230 = loc("tmp14"(#loc69))
549
+ #loc231 = loc("tmp14"(#loc70))
550
+ #loc232 = loc("tmp14"(#loc71))
551
+ #loc233 = loc("tmp14"(#loc72))
552
+ #loc234 = loc("tmp16"(#loc73))
553
+ #loc235 = loc("tmp16"(#loc74))
554
+ #loc236 = loc("tmp16"(#loc75))
555
+ #loc237 = loc("tmp16"(#loc76))
556
+ #loc238 = loc("tmp16"(#loc77))
557
+ #loc239 = loc("tmp16"(#loc78))
558
+ #loc240 = loc("tmp17"(#loc79))
559
+ #loc241 = loc("tmp18"(#loc80))
560
+ #loc242 = loc("tmp19"(#loc81))
561
+ #loc243 = loc("tmp20"(#loc82))
562
+ #loc244 = loc("mask_mod_output"(#loc83))
563
+ #loc245 = loc("post_mod_scores"(#loc84))
564
+ #loc246 = loc("post_mod_scores"(#loc85))
565
+ #loc248 = loc("m_ij"(#loc89))
566
+ #loc249 = loc("masked_out_rows"(#loc90))
567
+ #loc250 = loc("m_ij_masked"(#loc91))
568
+ #loc251 = loc("alpha"(#loc92))
569
+ #loc252 = loc("alpha"(#loc93))
570
+ #loc253 = loc("p"(#loc94))
571
+ #loc254 = loc("p"(#loc95))
572
+ #loc255 = loc("p"(#loc96))
573
+ #loc256 = loc("l_i"(#loc97))
574
+ #loc258 = loc("l_i"(#loc101))
575
+ #loc259 = loc("acc"(#loc102))
576
+ #loc260 = loc("acc"(#loc103))
577
+ #loc261 = loc("v"(#loc104))
578
+ #loc262 = loc("acc"(#loc105))
579
+ #loc263 = loc("acc"(#loc106))
580
+ #loc264 = loc("cur_block_idx"(#loc107))
581
+ #loc265 = loc("offset"(#loc108))
582
+ #loc266 = loc("cur_block"(#loc109))
583
+ #loc267 = loc("cur_block"(#loc110))
584
+ #loc268 = loc("next_block"(#loc111))
585
+ #loc269 = loc("next_block"(#loc112))
586
+ #loc270 = loc("next_block"(#loc113))
587
+ #loc271 = loc("next_block"(#loc114))
588
+ #loc272 = loc("needs_jump"(#loc115))
589
+ #loc273 = loc("needs_jump"(#loc116))
590
+ #loc274 = loc("needs_jump"(#loc117))
591
+ #loc275 = loc("jump_to_block"(#loc118))
592
+ #loc276 = loc("jump_to_block"(#loc119))
593
+ #loc277 = loc("jump_to_block"(#loc120))
594
+ #loc278 = loc("offset"(#loc121))
595
+ #loc279 = loc("offset"(#loc122))
596
+ #loc280 = loc("offset"(#loc123))
597
+ #loc281 = loc("offset"(#loc124))
598
+ #loc282 = loc("offs_n"(#loc125))
599
+ #loc283 = loc("kv_offset"(#loc126))
600
+ #loc284 = loc(callsite(#loc127 at #loc2))
601
+ #loc285 = loc("kv_indices"(#loc128))
602
+ #loc286 = loc("kv_start"(#loc129))
603
+ #loc287 = loc("kv_start"(#loc130))
604
+ #loc288 = loc("kv_num_blocks"(#loc131))
605
+ #loc289 = loc("kv_num_blocks"(#loc132))
606
+ #loc290 = loc("block_n_end"(#loc133))
607
+ #loc291 = loc("block_n_end"(#loc134))
608
+ #loc292 = loc("offs_n"(#loc135))
609
+ #loc294 = loc(callsite(#loc127 at #loc137))
610
+ #loc295 = loc("l_i"(#loc138))
611
+ #loc296 = loc("acc"(#loc139))
612
+ #loc297 = loc("acc"(#loc140))
613
+ #loc298 = loc("mask"(#loc141))
614
+ #loc299 = loc("off_hz"(#loc146))
615
+ #loc300 = loc("off_hz"(#loc147))
616
+ #loc301 = loc("l_ptrs"(#loc148))
617
+ #loc302 = loc("l_ptrs"(#loc149))
618
+ #loc303 = loc("l_ptrs"(#loc150))
619
+ #loc304 = loc("lse"(#loc151))
620
+ #loc305 = loc("lse"(#loc152))
621
+ #loc306 = loc(callsite(#loc3 at #loc172))
622
+ #loc307 = loc(callsite(#loc5 at #loc173))
623
+ #loc308 = loc(callsite(#loc192 at #loc172))
624
+ #loc309 = loc(callsite(#loc193 at #loc172))
625
+ #loc310 = loc(callsite(#loc194 at #loc172))
626
+ #loc311 = loc(callsite(#loc195 at #loc172))
627
+ #loc312 = loc(callsite(#loc196 at #loc172))
628
+ #loc313 = loc(callsite(#loc31 at #loc172))
629
+ #loc314 = loc(callsite(#loc38 at #loc203))
630
+ #loc315 = loc(callsite(#loc40 at #loc203))
631
+ #loc316 = loc("l_i"(#loc208))
632
+ #loc317 = loc(callsite(#loc209 at #loc210))
633
+ #loc318 = loc(callsite(#loc211 at #loc210))
634
+ #loc319 = loc(callsite(#loc212 at #loc210))
635
+ #loc320 = loc(callsite(#loc213 at #loc210))
636
+ #loc321 = loc(callsite(#loc214 at #loc210))
637
+ #loc322 = loc(callsite(#loc215 at #loc210))
638
+ #loc323 = loc(callsite(#loc216 at #loc210))
639
+ #loc324 = loc(callsite(#loc217 at #loc210))
640
+ #loc325 = loc(callsite(#loc218 at #loc210))
641
+ #loc326 = loc(callsite(#loc219 at #loc210))
642
+ #loc327 = loc(callsite(#loc220 at #loc210))
643
+ #loc328 = loc(callsite(#loc221 at #loc210))
644
+ #loc329 = loc(callsite(#loc222 at #loc210))
645
+ #loc330 = loc(callsite(#loc223 at #loc210))
646
+ #loc331 = loc(callsite(#loc224 at #loc210))
647
+ #loc332 = loc(callsite(#loc225 at #loc210))
648
+ #loc333 = loc(callsite(#loc226 at #loc210))
649
+ #loc334 = loc(callsite(#loc227 at #loc210))
650
+ #loc335 = loc(callsite(#loc228 at #loc210))
651
+ #loc336 = loc(callsite(#loc229 at #loc210))
652
+ #loc337 = loc(callsite(#loc230 at #loc210))
653
+ #loc338 = loc(callsite(#loc231 at #loc210))
654
+ #loc339 = loc(callsite(#loc232 at #loc210))
655
+ #loc340 = loc(callsite(#loc233 at #loc210))
656
+ #loc341 = loc(callsite(#loc234 at #loc210))
657
+ #loc342 = loc(callsite(#loc235 at #loc210))
658
+ #loc343 = loc(callsite(#loc236 at #loc210))
659
+ #loc344 = loc(callsite(#loc237 at #loc210))
660
+ #loc345 = loc(callsite(#loc238 at #loc210))
661
+ #loc346 = loc(callsite(#loc239 at #loc210))
662
+ #loc347 = loc(callsite(#loc240 at #loc210))
663
+ #loc348 = loc(callsite(#loc241 at #loc210))
664
+ #loc349 = loc(callsite(#loc242 at #loc210))
665
+ #loc350 = loc(callsite(#loc243 at #loc210))
666
+ #loc351 = loc(callsite(#loc244 at #loc210))
667
+ #loc352 = loc(callsite(#loc245 at #loc210))
668
+ #loc353 = loc(callsite(#loc246 at #loc210))
669
+ #loc355 = loc(callsite(#loc248 at #loc210))
670
+ #loc356 = loc(callsite(#loc249 at #loc210))
671
+ #loc357 = loc(callsite(#loc250 at #loc210))
672
+ #loc358 = loc(callsite(#loc251 at #loc210))
673
+ #loc359 = loc(callsite(#loc252 at #loc210))
674
+ #loc360 = loc(callsite(#loc253 at #loc210))
675
+ #loc361 = loc(callsite(#loc254 at #loc210))
676
+ #loc362 = loc(callsite(#loc255 at #loc210))
677
+ #loc363 = loc(callsite(#loc256 at #loc210))
678
+ #loc365 = loc(callsite(#loc258 at #loc210))
679
+ #loc366 = loc(callsite(#loc259 at #loc210))
680
+ #loc367 = loc(callsite(#loc260 at #loc210))
681
+ #loc368 = loc(callsite(#loc261 at #loc210))
682
+ #loc369 = loc(callsite(#loc262 at #loc210))
683
+ #loc370 = loc(callsite(#loc263 at #loc210))
684
+ #loc371 = loc(callsite(#loc265 at #loc2))
685
+ #loc372 = loc(callsite(#loc282 at #loc2))
686
+ #loc373 = loc(callsite(#loc283 at #loc2))
687
+ #loc374 = loc(callsite(#loc209 at #loc293))
688
+ #loc375 = loc(callsite(#loc211 at #loc293))
689
+ #loc376 = loc(callsite(#loc212 at #loc293))
690
+ #loc377 = loc(callsite(#loc213 at #loc293))
691
+ #loc378 = loc(callsite(#loc214 at #loc293))
692
+ #loc379 = loc(callsite(#loc215 at #loc293))
693
+ #loc380 = loc(callsite(#loc218 at #loc293))
694
+ #loc381 = loc(callsite(#loc219 at #loc293))
695
+ #loc382 = loc(callsite(#loc246 at #loc293))
696
+ #loc384 = loc(callsite(#loc248 at #loc293))
697
+ #loc385 = loc(callsite(#loc249 at #loc293))
698
+ #loc386 = loc(callsite(#loc250 at #loc293))
699
+ #loc387 = loc(callsite(#loc251 at #loc293))
700
+ #loc388 = loc(callsite(#loc252 at #loc293))
701
+ #loc389 = loc(callsite(#loc253 at #loc293))
702
+ #loc390 = loc(callsite(#loc254 at #loc293))
703
+ #loc391 = loc(callsite(#loc255 at #loc293))
704
+ #loc392 = loc(callsite(#loc256 at #loc293))
705
+ #loc394 = loc(callsite(#loc258 at #loc293))
706
+ #loc395 = loc(callsite(#loc259 at #loc293))
707
+ #loc396 = loc(callsite(#loc260 at #loc293))
708
+ #loc397 = loc(callsite(#loc261 at #loc293))
709
+ #loc398 = loc(callsite(#loc262 at #loc293))
710
+ #loc399 = loc(callsite(#loc263 at #loc293))
711
+ #loc400 = loc(callsite(#loc265 at #loc137))
712
+ #loc401 = loc(callsite(#loc282 at #loc137))
713
+ #loc402 = loc(callsite(#loc283 at #loc137))
714
+ #loc403 = loc("m_i"(#loc316))
715
+ #loc404 = loc(callsite(#loc192 at #loc319))
716
+ #loc405 = loc(callsite(#loc193 at #loc319))
717
+ #loc406 = loc(callsite(#loc194 at #loc319))
718
+ #loc407 = loc(callsite(#loc196 at #loc319))
719
+ #loc408 = loc(callsite(#loc31 at #loc319))
720
+ #loc409 = loc(callsite(#loc3 at #loc319))
721
+ #loc410 = loc(callsite(#loc54 at #loc323))
722
+ #loc411 = loc(callsite(#loc54 at #loc324))
723
+ #loc412 = loc(callsite(#loc86 at #loc354))
724
+ #loc414 = loc(callsite(#loc98 at #loc364))
725
+ #loc416 = loc(callsite(#loc194 at #loc368))
726
+ #loc417 = loc(callsite(#loc196 at #loc368))
727
+ #loc418 = loc(callsite(#loc3 at #loc368))
728
+ #loc419 = loc(callsite(#loc264 at #loc371))
729
+ #loc420 = loc(callsite(#loc266 at #loc371))
730
+ #loc421 = loc(callsite(#loc267 at #loc371))
731
+ #loc422 = loc(callsite(#loc268 at #loc371))
732
+ #loc423 = loc(callsite(#loc269 at #loc371))
733
+ #loc424 = loc(callsite(#loc270 at #loc371))
734
+ #loc425 = loc(callsite(#loc271 at #loc371))
735
+ #loc426 = loc(callsite(#loc272 at #loc371))
736
+ #loc427 = loc(callsite(#loc273 at #loc371))
737
+ #loc428 = loc(callsite(#loc274 at #loc371))
738
+ #loc429 = loc(callsite(#loc275 at #loc371))
739
+ #loc430 = loc(callsite(#loc276 at #loc371))
740
+ #loc431 = loc(callsite(#loc277 at #loc371))
741
+ #loc432 = loc(callsite(#loc278 at #loc371))
742
+ #loc433 = loc(callsite(#loc279 at #loc371))
743
+ #loc434 = loc(callsite(#loc280 at #loc371))
744
+ #loc435 = loc(callsite(#loc281 at #loc371))
745
+ #loc436 = loc(callsite(#loc192 at #loc376))
746
+ #loc437 = loc(callsite(#loc193 at #loc376))
747
+ #loc438 = loc(callsite(#loc194 at #loc376))
748
+ #loc439 = loc(callsite(#loc196 at #loc376))
749
+ #loc440 = loc(callsite(#loc31 at #loc376))
750
+ #loc441 = loc(callsite(#loc3 at #loc376))
751
+ #loc442 = loc(callsite(#loc86 at #loc383))
752
+ #loc444 = loc(callsite(#loc98 at #loc393))
753
+ #loc446 = loc(callsite(#loc194 at #loc397))
754
+ #loc447 = loc(callsite(#loc196 at #loc397))
755
+ #loc448 = loc(callsite(#loc3 at #loc397))
756
+ #loc449 = loc(callsite(#loc264 at #loc400))
757
+ #loc450 = loc(callsite(#loc266 at #loc400))
758
+ #loc451 = loc(callsite(#loc267 at #loc400))
759
+ #loc452 = loc(callsite(#loc268 at #loc400))
760
+ #loc453 = loc(callsite(#loc269 at #loc400))
761
+ #loc454 = loc(callsite(#loc270 at #loc400))
762
+ #loc455 = loc(callsite(#loc271 at #loc400))
763
+ #loc456 = loc(callsite(#loc272 at #loc400))
764
+ #loc457 = loc(callsite(#loc273 at #loc400))
765
+ #loc458 = loc(callsite(#loc274 at #loc400))
766
+ #loc459 = loc(callsite(#loc275 at #loc400))
767
+ #loc460 = loc(callsite(#loc276 at #loc400))
768
+ #loc461 = loc(callsite(#loc277 at #loc400))
769
+ #loc462 = loc(callsite(#loc278 at #loc400))
770
+ #loc463 = loc(callsite(#loc279 at #loc400))
771
+ #loc464 = loc(callsite(#loc280 at #loc400))
772
+ #loc465 = loc(callsite(#loc281 at #loc400))
773
+ #loc466 = loc("offs_n"(#loc403))
774
+ #loc467 = loc(callsite(#loc88 at #loc412))
775
+ #loc468 = loc(callsite(#loc100 at #loc414))
776
+ #loc469 = loc(callsite(#loc88 at #loc442))
777
+ #loc470 = loc(callsite(#loc100 at #loc444))
778
+ #loc471 = loc("kv_offset"(#loc466))
779
+ #loc472 = loc(callsite(#loc471 at #loc2))
780
+ #loc473 = loc(callsite(#loc471 at #loc137))
progress/SpecForge/cache/compiled_kernels/triton/3/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/__grp__triton_per_fused_mul_1.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_per_fused_mul_1.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.source", "triton_per_fused_mul_1.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.ttir", "triton_per_fused_mul_1.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.ttgir", "triton_per_fused_mul_1.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.llir", "triton_per_fused_mul_1.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.ptx", "triton_per_fused_mul_1.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.cubin", "triton_per_fused_mul_1.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.json"}}
progress/SpecForge/cache/compiled_kernels/triton/3/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.cubin ADDED
Binary file (40.6 kB). View file
 
progress/SpecForge/cache/compiled_kernels/triton/3/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "d74899755dbce272e791805e3b63e07d55dd638854536aa9def087e90d782d2d", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 4096, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused_mul_1"}
progress/SpecForge/cache/compiled_kernels/triton/3/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.llir ADDED
@@ -0,0 +1,1190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+ target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
4
+
5
+ @global_smem = external addrspace(3) global [0 x i8], align 16
6
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
7
+
8
+ ; Function Attrs: nounwind
9
+ define ptx_kernel void @triton_per_fused_mul_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i64 %5, i32 %6, i32 %7, ptr addrspace(1) readnone captures(none) %8, ptr addrspace(1) readnone captures(none) %9) local_unnamed_addr #0 !dbg !5 {
10
+ %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8
11
+ %12 = shl i32 %11, 7, !dbg !9
12
+ %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
13
+ %14 = lshr i32 %13, 5, !dbg !10
14
+ %15 = and i32 %13, 31, !dbg !10
15
+ %16 = shl nuw nsw i32 %15, 2, !dbg !10
16
+ %17 = or disjoint i32 %16, %12, !dbg !11
17
+ %18 = icmp slt i32 %17, %6, !dbg !12
18
+ %19 = and i32 %14, 7, !dbg !13
19
+ %20 = or disjoint i32 %19, 8, !dbg !13
20
+ %21 = or disjoint i32 %19, 16, !dbg !13
21
+ %22 = or i32 %14, 24, !dbg !13
22
+ %23 = sext i32 %17 to i64, !dbg !14
23
+ %24 = shl i64 %5, 5, !dbg !15
24
+ %25 = zext nneg i32 %19 to i64, !dbg !16
25
+ %26 = zext nneg i32 %20 to i64, !dbg !16
26
+ %27 = zext nneg i32 %21 to i64, !dbg !16
27
+ %28 = zext nneg i32 %22 to i64, !dbg !16
28
+ %29 = mul i64 %24, %25, !dbg !16
29
+ %30 = mul i64 %24, %26, !dbg !16
30
+ %31 = mul i64 %24, %27, !dbg !16
31
+ %32 = mul i64 %24, %28, !dbg !16
32
+ %33 = add i64 %29, %23, !dbg !17
33
+ %34 = add i64 %30, %23, !dbg !17
34
+ %35 = add i64 %31, %23, !dbg !17
35
+ %36 = add i64 %32, %23, !dbg !17
36
+ %37 = getelementptr float, ptr addrspace(1) %0, i64 %33, !dbg !18
37
+ %38 = getelementptr float, ptr addrspace(1) %0, i64 %34, !dbg !18
38
+ %39 = getelementptr float, ptr addrspace(1) %0, i64 %35, !dbg !18
39
+ %40 = getelementptr float, ptr addrspace(1) %0, i64 %36, !dbg !18
40
+ %41 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %37, i1 %18) #6, !dbg !19
41
+ %42 = extractvalue { i32, i32, i32, i32 } %41, 0, !dbg !19
42
+ %43 = extractvalue { i32, i32, i32, i32 } %41, 1, !dbg !19
43
+ %44 = extractvalue { i32, i32, i32, i32 } %41, 2, !dbg !19
44
+ %45 = extractvalue { i32, i32, i32, i32 } %41, 3, !dbg !19
45
+ %46 = bitcast i32 %42 to float, !dbg !19
46
+ %47 = bitcast i32 %43 to float, !dbg !19
47
+ %48 = bitcast i32 %44 to float, !dbg !19
48
+ %49 = bitcast i32 %45 to float, !dbg !19
49
+ %50 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %38, i1 %18) #6, !dbg !19
50
+ %51 = extractvalue { i32, i32, i32, i32 } %50, 0, !dbg !19
51
+ %52 = extractvalue { i32, i32, i32, i32 } %50, 1, !dbg !19
52
+ %53 = extractvalue { i32, i32, i32, i32 } %50, 2, !dbg !19
53
+ %54 = extractvalue { i32, i32, i32, i32 } %50, 3, !dbg !19
54
+ %55 = bitcast i32 %51 to float, !dbg !19
55
+ %56 = bitcast i32 %52 to float, !dbg !19
56
+ %57 = bitcast i32 %53 to float, !dbg !19
57
+ %58 = bitcast i32 %54 to float, !dbg !19
58
+ %59 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %39, i1 %18) #6, !dbg !19
59
+ %60 = extractvalue { i32, i32, i32, i32 } %59, 0, !dbg !19
60
+ %61 = extractvalue { i32, i32, i32, i32 } %59, 1, !dbg !19
61
+ %62 = extractvalue { i32, i32, i32, i32 } %59, 2, !dbg !19
62
+ %63 = extractvalue { i32, i32, i32, i32 } %59, 3, !dbg !19
63
+ %64 = bitcast i32 %60 to float, !dbg !19
64
+ %65 = bitcast i32 %61 to float, !dbg !19
65
+ %66 = bitcast i32 %62 to float, !dbg !19
66
+ %67 = bitcast i32 %63 to float, !dbg !19
67
+ %68 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %40, i1 %18) #6, !dbg !19
68
+ %69 = extractvalue { i32, i32, i32, i32 } %68, 0, !dbg !19
69
+ %70 = extractvalue { i32, i32, i32, i32 } %68, 1, !dbg !19
70
+ %71 = extractvalue { i32, i32, i32, i32 } %68, 2, !dbg !19
71
+ %72 = extractvalue { i32, i32, i32, i32 } %68, 3, !dbg !19
72
+ %73 = bitcast i32 %69 to float, !dbg !19
73
+ %74 = bitcast i32 %70 to float, !dbg !19
74
+ %75 = bitcast i32 %71 to float, !dbg !19
75
+ %76 = bitcast i32 %72 to float, !dbg !19
76
+ %77 = getelementptr float, ptr addrspace(1) %1, i64 %33, !dbg !20
77
+ %78 = getelementptr float, ptr addrspace(1) %1, i64 %34, !dbg !20
78
+ %79 = getelementptr float, ptr addrspace(1) %1, i64 %35, !dbg !20
79
+ %80 = getelementptr float, ptr addrspace(1) %1, i64 %36, !dbg !20
80
+ %81 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %77, i1 %18) #6, !dbg !21
81
+ %82 = extractvalue { i32, i32, i32, i32 } %81, 0, !dbg !21
82
+ %83 = extractvalue { i32, i32, i32, i32 } %81, 1, !dbg !21
83
+ %84 = extractvalue { i32, i32, i32, i32 } %81, 2, !dbg !21
84
+ %85 = extractvalue { i32, i32, i32, i32 } %81, 3, !dbg !21
85
+ %86 = bitcast i32 %82 to float, !dbg !21
86
+ %87 = bitcast i32 %83 to float, !dbg !21
87
+ %88 = bitcast i32 %84 to float, !dbg !21
88
+ %89 = bitcast i32 %85 to float, !dbg !21
89
+ %90 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %78, i1 %18) #6, !dbg !21
90
+ %91 = extractvalue { i32, i32, i32, i32 } %90, 0, !dbg !21
91
+ %92 = extractvalue { i32, i32, i32, i32 } %90, 1, !dbg !21
92
+ %93 = extractvalue { i32, i32, i32, i32 } %90, 2, !dbg !21
93
+ %94 = extractvalue { i32, i32, i32, i32 } %90, 3, !dbg !21
94
+ %95 = bitcast i32 %91 to float, !dbg !21
95
+ %96 = bitcast i32 %92 to float, !dbg !21
96
+ %97 = bitcast i32 %93 to float, !dbg !21
97
+ %98 = bitcast i32 %94 to float, !dbg !21
98
+ %99 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %79, i1 %18) #6, !dbg !21
99
+ %100 = extractvalue { i32, i32, i32, i32 } %99, 0, !dbg !21
100
+ %101 = extractvalue { i32, i32, i32, i32 } %99, 1, !dbg !21
101
+ %102 = extractvalue { i32, i32, i32, i32 } %99, 2, !dbg !21
102
+ %103 = extractvalue { i32, i32, i32, i32 } %99, 3, !dbg !21
103
+ %104 = bitcast i32 %100 to float, !dbg !21
104
+ %105 = bitcast i32 %101 to float, !dbg !21
105
+ %106 = bitcast i32 %102 to float, !dbg !21
106
+ %107 = bitcast i32 %103 to float, !dbg !21
107
+ %108 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %80, i1 %18) #6, !dbg !21
108
+ %109 = extractvalue { i32, i32, i32, i32 } %108, 0, !dbg !21
109
+ %110 = extractvalue { i32, i32, i32, i32 } %108, 1, !dbg !21
110
+ %111 = extractvalue { i32, i32, i32, i32 } %108, 2, !dbg !21
111
+ %112 = extractvalue { i32, i32, i32, i32 } %108, 3, !dbg !21
112
+ %113 = bitcast i32 %109 to float, !dbg !21
113
+ %114 = bitcast i32 %110 to float, !dbg !21
114
+ %115 = bitcast i32 %111 to float, !dbg !21
115
+ %116 = bitcast i32 %112 to float, !dbg !21
116
+ %117 = select i1 %18, float %46, float 0xFFF0000000000000, !dbg !22
117
+ %118 = select i1 %18, float %47, float 0xFFF0000000000000, !dbg !22
118
+ %119 = select i1 %18, float %48, float 0xFFF0000000000000, !dbg !22
119
+ %120 = select i1 %18, float %49, float 0xFFF0000000000000, !dbg !22
120
+ %121 = select i1 %18, float %55, float 0xFFF0000000000000, !dbg !22
121
+ %122 = select i1 %18, float %56, float 0xFFF0000000000000, !dbg !22
122
+ %123 = select i1 %18, float %57, float 0xFFF0000000000000, !dbg !22
123
+ %124 = select i1 %18, float %58, float 0xFFF0000000000000, !dbg !22
124
+ %125 = select i1 %18, float %64, float 0xFFF0000000000000, !dbg !22
125
+ %126 = select i1 %18, float %65, float 0xFFF0000000000000, !dbg !22
126
+ %127 = select i1 %18, float %66, float 0xFFF0000000000000, !dbg !22
127
+ %128 = select i1 %18, float %67, float 0xFFF0000000000000, !dbg !22
128
+ %129 = select i1 %18, float %73, float 0xFFF0000000000000, !dbg !22
129
+ %130 = select i1 %18, float %74, float 0xFFF0000000000000, !dbg !22
130
+ %131 = select i1 %18, float %75, float 0xFFF0000000000000, !dbg !22
131
+ %132 = select i1 %18, float %76, float 0xFFF0000000000000, !dbg !22
132
+ %133 = fcmp ogt float %117, %121, !dbg !23
133
+ %134 = fcmp uno float %117, 0.000000e+00, !dbg !27
134
+ %135 = or i1 %134, %133, !dbg !28
135
+ %136 = select i1 %135, float %117, float %121, !dbg !29
136
+ %137 = fcmp ogt float %118, %122, !dbg !23
137
+ %138 = fcmp uno float %118, 0.000000e+00, !dbg !27
138
+ %139 = or i1 %138, %137, !dbg !28
139
+ %140 = select i1 %139, float %118, float %122, !dbg !29
140
+ %141 = fcmp ogt float %119, %123, !dbg !23
141
+ %142 = fcmp uno float %119, 0.000000e+00, !dbg !27
142
+ %143 = or i1 %142, %141, !dbg !28
143
+ %144 = select i1 %143, float %119, float %123, !dbg !29
144
+ %145 = fcmp ogt float %120, %124, !dbg !23
145
+ %146 = fcmp uno float %120, 0.000000e+00, !dbg !27
146
+ %147 = or i1 %146, %145, !dbg !28
147
+ %148 = select i1 %147, float %120, float %124, !dbg !29
148
+ %149 = fcmp ogt float %136, %125, !dbg !23
149
+ %150 = fcmp uno float %136, 0.000000e+00, !dbg !27
150
+ %151 = or i1 %149, %150, !dbg !28
151
+ %152 = select i1 %151, float %136, float %125, !dbg !29
152
+ %153 = fcmp ogt float %140, %126, !dbg !23
153
+ %154 = fcmp uno float %140, 0.000000e+00, !dbg !27
154
+ %155 = or i1 %153, %154, !dbg !28
155
+ %156 = select i1 %155, float %140, float %126, !dbg !29
156
+ %157 = fcmp ogt float %144, %127, !dbg !23
157
+ %158 = fcmp uno float %144, 0.000000e+00, !dbg !27
158
+ %159 = or i1 %157, %158, !dbg !28
159
+ %160 = select i1 %159, float %144, float %127, !dbg !29
160
+ %161 = fcmp ogt float %148, %128, !dbg !23
161
+ %162 = fcmp uno float %148, 0.000000e+00, !dbg !27
162
+ %163 = or i1 %161, %162, !dbg !28
163
+ %164 = select i1 %163, float %148, float %128, !dbg !29
164
+ %165 = fcmp ogt float %152, %129, !dbg !23
165
+ %166 = fcmp uno float %152, 0.000000e+00, !dbg !27
166
+ %167 = or i1 %165, %166, !dbg !28
167
+ %168 = select i1 %167, float %152, float %129, !dbg !29
168
+ %169 = fcmp ogt float %156, %130, !dbg !23
169
+ %170 = fcmp uno float %156, 0.000000e+00, !dbg !27
170
+ %171 = or i1 %169, %170, !dbg !28
171
+ %172 = select i1 %171, float %156, float %130, !dbg !29
172
+ %173 = fcmp ogt float %160, %131, !dbg !23
173
+ %174 = fcmp uno float %160, 0.000000e+00, !dbg !27
174
+ %175 = or i1 %173, %174, !dbg !28
175
+ %176 = select i1 %175, float %160, float %131, !dbg !29
176
+ %177 = fcmp ogt float %164, %132, !dbg !23
177
+ %178 = fcmp uno float %164, 0.000000e+00, !dbg !27
178
+ %179 = or i1 %177, %178, !dbg !28
179
+ %180 = select i1 %179, float %164, float %132, !dbg !29
180
+ %.idx = shl nuw nsw i32 %15, 7, !dbg !30
181
+ %181 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx, !dbg !30
182
+ %182 = getelementptr float, ptr addrspace(3) %181, i32 %19, !dbg !30
183
+ %183 = bitcast float %168 to <1 x i32>, !dbg !30
184
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %182, <1 x i32> %183, i1 true) #6, !dbg !30
185
+ %184 = getelementptr i8, ptr addrspace(3) %181, i32 32, !dbg !30
186
+ %185 = getelementptr float, ptr addrspace(3) %184, i32 %19, !dbg !30
187
+ %186 = bitcast float %172 to <1 x i32>, !dbg !30
188
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %185, <1 x i32> %186, i1 true) #6, !dbg !30
189
+ %187 = getelementptr i8, ptr addrspace(3) %181, i32 64, !dbg !30
190
+ %188 = getelementptr float, ptr addrspace(3) %187, i32 %19, !dbg !30
191
+ %189 = bitcast float %176 to <1 x i32>, !dbg !30
192
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %189, i1 true) #6, !dbg !30
193
+ %190 = getelementptr i8, ptr addrspace(3) %181, i32 96, !dbg !30
194
+ %191 = getelementptr float, ptr addrspace(3) %190, i32 %19, !dbg !30
195
+ %192 = bitcast float %180 to <1 x i32>, !dbg !30
196
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %192, i1 true) #6, !dbg !30
197
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !30
198
+ %193 = getelementptr float, ptr addrspace(3) @global_smem, i32 %13, !dbg !30
199
+ %194 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %193, i1 true) #6, !dbg !30
200
+ %195 = bitcast i32 %194 to float, !dbg !30
201
+ %196 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %194, i32 4, i32 31), !dbg !30
202
+ %197 = bitcast i32 %196 to float, !dbg !30
203
+ %198 = fcmp ogt float %195, %197, !dbg !23
204
+ %199 = fcmp uno float %195, 0.000000e+00, !dbg !27
205
+ %200 = or i1 %199, %198, !dbg !28
206
+ %201 = select i1 %200, float %195, float %197, !dbg !29
207
+ %202 = bitcast float %201 to i32, !dbg !30
208
+ %203 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %202, i32 2, i32 31), !dbg !30
209
+ %204 = bitcast i32 %203 to float, !dbg !30
210
+ %205 = fcmp ogt float %201, %204, !dbg !23
211
+ %206 = fcmp uno float %201, 0.000000e+00, !dbg !27
212
+ %207 = or i1 %205, %206, !dbg !28
213
+ %208 = select i1 %207, float %201, float %204, !dbg !29
214
+ %209 = bitcast float %208 to i32, !dbg !30
215
+ %210 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %209, i32 1, i32 31), !dbg !30
216
+ %211 = bitcast i32 %210 to float, !dbg !30
217
+ %212 = fcmp ogt float %208, %211, !dbg !23
218
+ %213 = fcmp uno float %208, 0.000000e+00, !dbg !27
219
+ %214 = or i1 %212, %213, !dbg !28
220
+ %215 = and i32 %13, 7, !dbg !30
221
+ %216 = icmp eq i32 %215, 0, !dbg !30
222
+ %217 = select i1 %214, i32 %209, i32 %210, !dbg !29
223
+ %218 = insertelement <1 x i32> poison, i32 %217, i64 0, !dbg !30
224
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %193, <1 x i32> %218, i1 %216) #6, !dbg !30
225
+ %219 = getelementptr i8, ptr addrspace(3) %193, i32 1024, !dbg !30
226
+ %220 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %219, i1 true) #6, !dbg !30
227
+ %221 = bitcast i32 %220 to float, !dbg !30
228
+ %222 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %220, i32 4, i32 31), !dbg !30
229
+ %223 = bitcast i32 %222 to float, !dbg !30
230
+ %224 = fcmp ogt float %221, %223, !dbg !23
231
+ %225 = fcmp uno float %221, 0.000000e+00, !dbg !27
232
+ %226 = or i1 %225, %224, !dbg !28
233
+ %227 = select i1 %226, float %221, float %223, !dbg !29
234
+ %228 = bitcast float %227 to i32, !dbg !30
235
+ %229 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %228, i32 2, i32 31), !dbg !30
236
+ %230 = bitcast i32 %229 to float, !dbg !30
237
+ %231 = fcmp ogt float %227, %230, !dbg !23
238
+ %232 = fcmp uno float %227, 0.000000e+00, !dbg !27
239
+ %233 = or i1 %231, %232, !dbg !28
240
+ %234 = select i1 %233, float %227, float %230, !dbg !29
241
+ %235 = bitcast float %234 to i32, !dbg !30
242
+ %236 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %235, i32 1, i32 31), !dbg !30
243
+ %237 = bitcast i32 %236 to float, !dbg !30
244
+ %238 = fcmp ogt float %234, %237, !dbg !23
245
+ %239 = fcmp uno float %234, 0.000000e+00, !dbg !27
246
+ %240 = or i1 %238, %239, !dbg !28
247
+ %241 = select i1 %240, i32 %235, i32 %236, !dbg !29
248
+ %242 = insertelement <1 x i32> poison, i32 %241, i64 0, !dbg !30
249
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %219, <1 x i32> %242, i1 %216) #6, !dbg !30
250
+ %243 = getelementptr i8, ptr addrspace(3) %193, i32 2048, !dbg !30
251
+ %244 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %243, i1 true) #6, !dbg !30
252
+ %245 = bitcast i32 %244 to float, !dbg !30
253
+ %246 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %244, i32 4, i32 31), !dbg !30
254
+ %247 = bitcast i32 %246 to float, !dbg !30
255
+ %248 = fcmp ogt float %245, %247, !dbg !23
256
+ %249 = fcmp uno float %245, 0.000000e+00, !dbg !27
257
+ %250 = or i1 %249, %248, !dbg !28
258
+ %251 = select i1 %250, float %245, float %247, !dbg !29
259
+ %252 = bitcast float %251 to i32, !dbg !30
260
+ %253 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %252, i32 2, i32 31), !dbg !30
261
+ %254 = bitcast i32 %253 to float, !dbg !30
262
+ %255 = fcmp ogt float %251, %254, !dbg !23
263
+ %256 = fcmp uno float %251, 0.000000e+00, !dbg !27
264
+ %257 = or i1 %255, %256, !dbg !28
265
+ %258 = select i1 %257, float %251, float %254, !dbg !29
266
+ %259 = bitcast float %258 to i32, !dbg !30
267
+ %260 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %259, i32 1, i32 31), !dbg !30
268
+ %261 = bitcast i32 %260 to float, !dbg !30
269
+ %262 = fcmp ogt float %258, %261, !dbg !23
270
+ %263 = fcmp uno float %258, 0.000000e+00, !dbg !27
271
+ %264 = or i1 %262, %263, !dbg !28
272
+ %265 = select i1 %264, i32 %259, i32 %260, !dbg !29
273
+ %266 = insertelement <1 x i32> poison, i32 %265, i64 0, !dbg !30
274
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %243, <1 x i32> %266, i1 %216) #6, !dbg !30
275
+ %267 = getelementptr i8, ptr addrspace(3) %193, i32 3072, !dbg !30
276
+ %268 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %267, i1 true) #6, !dbg !30
277
+ %269 = bitcast i32 %268 to float, !dbg !30
278
+ %270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %268, i32 4, i32 31), !dbg !30
279
+ %271 = bitcast i32 %270 to float, !dbg !30
280
+ %272 = fcmp ogt float %269, %271, !dbg !23
281
+ %273 = fcmp uno float %269, 0.000000e+00, !dbg !27
282
+ %274 = or i1 %273, %272, !dbg !28
283
+ %275 = select i1 %274, float %269, float %271, !dbg !29
284
+ %276 = bitcast float %275 to i32, !dbg !30
285
+ %277 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %276, i32 2, i32 31), !dbg !30
286
+ %278 = bitcast i32 %277 to float, !dbg !30
287
+ %279 = fcmp ogt float %275, %278, !dbg !23
288
+ %280 = fcmp uno float %275, 0.000000e+00, !dbg !27
289
+ %281 = or i1 %279, %280, !dbg !28
290
+ %282 = select i1 %281, float %275, float %278, !dbg !29
291
+ %283 = bitcast float %282 to i32, !dbg !30
292
+ %284 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %283, i32 1, i32 31), !dbg !30
293
+ %285 = bitcast i32 %284 to float, !dbg !30
294
+ %286 = fcmp ogt float %282, %285, !dbg !23
295
+ %287 = fcmp uno float %282, 0.000000e+00, !dbg !27
296
+ %288 = or i1 %286, %287, !dbg !28
297
+ %289 = select i1 %288, i32 %283, i32 %284, !dbg !29
298
+ %290 = insertelement <1 x i32> poison, i32 %289, i64 0, !dbg !30
299
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %267, <1 x i32> %290, i1 %216) #6, !dbg !30
300
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !30
301
+ %291 = load float, ptr addrspace(3) %181, align 16, !dbg !30
302
+ %292 = load float, ptr addrspace(3) %184, align 16, !dbg !30
303
+ %293 = load float, ptr addrspace(3) %187, align 16, !dbg !30
304
+ %294 = load float, ptr addrspace(3) %190, align 16, !dbg !30
305
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !31
306
+ %295 = shl nuw nsw i32 %15, 4, !dbg !31
307
+ %296 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %295, !dbg !31
308
+ %297 = insertelement <4 x float> poison, float %291, i64 0, !dbg !31
309
+ %298 = insertelement <4 x float> %297, float %292, i64 1, !dbg !31
310
+ %299 = insertelement <4 x float> %298, float %293, i64 2, !dbg !31
311
+ %300 = insertelement <4 x float> %299, float %294, i64 3, !dbg !31
312
+ store <4 x float> %300, ptr addrspace(3) %296, align 16, !dbg !31
313
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !31
314
+ %301 = shl nuw nsw i32 %14, 7, !dbg !31
315
+ %302 = shl nuw nsw i32 %13, 4, !dbg !31
316
+ %303 = or i32 %301, %302, !dbg !31
317
+ %304 = and i32 %303, 496, !dbg !31
318
+ %305 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %304, !dbg !31
319
+ %306 = ptrtoint ptr addrspace(3) %305 to i32, !dbg !31
320
+ %307 = tail call i32 asm sideeffect "ldmatrix.sync.aligned.m8n8.x1.shared.b16 {$0}, [$1];", "=r,r"(i32 %306) #6, !dbg !31
321
+ %308 = fcmp oeq float %291, 0xFFF0000000000000, !dbg !32
322
+ %309 = fcmp oeq float %292, 0xFFF0000000000000, !dbg !32
323
+ %310 = fcmp oeq float %293, 0xFFF0000000000000, !dbg !32
324
+ %311 = fcmp oeq float %294, 0xFFF0000000000000, !dbg !32
325
+ %312 = fsub float %46, %291, !dbg !33
326
+ %313 = fsub float %47, %292, !dbg !33
327
+ %314 = fsub float %48, %293, !dbg !33
328
+ %315 = fsub float %49, %294, !dbg !33
329
+ %316 = fsub float %55, %291, !dbg !33
330
+ %317 = fsub float %56, %292, !dbg !33
331
+ %318 = fsub float %57, %293, !dbg !33
332
+ %319 = fsub float %58, %294, !dbg !33
333
+ %320 = fsub float %64, %291, !dbg !33
334
+ %321 = fsub float %65, %292, !dbg !33
335
+ %322 = fsub float %66, %293, !dbg !33
336
+ %323 = fsub float %67, %294, !dbg !33
337
+ %324 = fsub float %73, %291, !dbg !33
338
+ %325 = fsub float %74, %292, !dbg !33
339
+ %326 = fsub float %75, %293, !dbg !33
340
+ %327 = fsub float %76, %294, !dbg !33
341
+ %328 = select i1 %308, float 0.000000e+00, float %312, !dbg !34
342
+ %329 = select i1 %309, float 0.000000e+00, float %313, !dbg !34
343
+ %330 = select i1 %310, float 0.000000e+00, float %314, !dbg !34
344
+ %331 = select i1 %311, float 0.000000e+00, float %315, !dbg !34
345
+ %332 = select i1 %308, float 0.000000e+00, float %316, !dbg !34
346
+ %333 = select i1 %309, float 0.000000e+00, float %317, !dbg !34
347
+ %334 = select i1 %310, float 0.000000e+00, float %318, !dbg !34
348
+ %335 = select i1 %311, float 0.000000e+00, float %319, !dbg !34
349
+ %336 = select i1 %308, float 0.000000e+00, float %320, !dbg !34
350
+ %337 = select i1 %309, float 0.000000e+00, float %321, !dbg !34
351
+ %338 = select i1 %310, float 0.000000e+00, float %322, !dbg !34
352
+ %339 = select i1 %311, float 0.000000e+00, float %323, !dbg !34
353
+ %340 = select i1 %308, float 0.000000e+00, float %324, !dbg !34
354
+ %341 = select i1 %309, float 0.000000e+00, float %325, !dbg !34
355
+ %342 = select i1 %310, float 0.000000e+00, float %326, !dbg !34
356
+ %343 = select i1 %311, float 0.000000e+00, float %327, !dbg !34
357
+ %344 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
358
+ %.not.i = icmp eq i32 %344, 0, !dbg !35
359
+ br i1 %.not.i, label %347, label %345, !dbg !35
360
+
361
+ 345: ; preds = %10
362
+ %346 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %328) #6, !dbg !35
363
+ br label %__nv_exp2f.exit, !dbg !35
364
+
365
+ 347: ; preds = %10
366
+ %348 = tail call float @llvm.nvvm.ex2.approx.f(float %328) #6, !dbg !35
367
+ br label %__nv_exp2f.exit, !dbg !35
368
+
369
+ __nv_exp2f.exit: ; preds = %345, %347
370
+ %.0.i = phi float [ %346, %345 ], [ %348, %347 ], !dbg !35
371
+ %349 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
372
+ %.not.i1 = icmp eq i32 %349, 0, !dbg !35
373
+ br i1 %.not.i1, label %352, label %350, !dbg !35
374
+
375
+ 350: ; preds = %__nv_exp2f.exit
376
+ %351 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %329) #6, !dbg !35
377
+ br label %__nv_exp2f.exit3, !dbg !35
378
+
379
+ 352: ; preds = %__nv_exp2f.exit
380
+ %353 = tail call float @llvm.nvvm.ex2.approx.f(float %329) #6, !dbg !35
381
+ br label %__nv_exp2f.exit3, !dbg !35
382
+
383
+ __nv_exp2f.exit3: ; preds = %350, %352
384
+ %.0.i2 = phi float [ %351, %350 ], [ %353, %352 ], !dbg !35
385
+ %354 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
386
+ %.not.i4 = icmp eq i32 %354, 0, !dbg !35
387
+ br i1 %.not.i4, label %357, label %355, !dbg !35
388
+
389
+ 355: ; preds = %__nv_exp2f.exit3
390
+ %356 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %330) #6, !dbg !35
391
+ br label %__nv_exp2f.exit6, !dbg !35
392
+
393
+ 357: ; preds = %__nv_exp2f.exit3
394
+ %358 = tail call float @llvm.nvvm.ex2.approx.f(float %330) #6, !dbg !35
395
+ br label %__nv_exp2f.exit6, !dbg !35
396
+
397
+ __nv_exp2f.exit6: ; preds = %355, %357
398
+ %.0.i5 = phi float [ %356, %355 ], [ %358, %357 ], !dbg !35
399
+ %359 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
400
+ %.not.i7 = icmp eq i32 %359, 0, !dbg !35
401
+ br i1 %.not.i7, label %362, label %360, !dbg !35
402
+
403
+ 360: ; preds = %__nv_exp2f.exit6
404
+ %361 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %331) #6, !dbg !35
405
+ br label %__nv_exp2f.exit9, !dbg !35
406
+
407
+ 362: ; preds = %__nv_exp2f.exit6
408
+ %363 = tail call float @llvm.nvvm.ex2.approx.f(float %331) #6, !dbg !35
409
+ br label %__nv_exp2f.exit9, !dbg !35
410
+
411
+ __nv_exp2f.exit9: ; preds = %360, %362
412
+ %.0.i8 = phi float [ %361, %360 ], [ %363, %362 ], !dbg !35
413
+ %364 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
414
+ %.not.i10 = icmp eq i32 %364, 0, !dbg !35
415
+ br i1 %.not.i10, label %367, label %365, !dbg !35
416
+
417
+ 365: ; preds = %__nv_exp2f.exit9
418
+ %366 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %332) #6, !dbg !35
419
+ br label %__nv_exp2f.exit12, !dbg !35
420
+
421
+ 367: ; preds = %__nv_exp2f.exit9
422
+ %368 = tail call float @llvm.nvvm.ex2.approx.f(float %332) #6, !dbg !35
423
+ br label %__nv_exp2f.exit12, !dbg !35
424
+
425
+ __nv_exp2f.exit12: ; preds = %365, %367
426
+ %.0.i11 = phi float [ %366, %365 ], [ %368, %367 ], !dbg !35
427
+ %369 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
428
+ %.not.i13 = icmp eq i32 %369, 0, !dbg !35
429
+ br i1 %.not.i13, label %372, label %370, !dbg !35
430
+
431
+ 370: ; preds = %__nv_exp2f.exit12
432
+ %371 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %333) #6, !dbg !35
433
+ br label %__nv_exp2f.exit15, !dbg !35
434
+
435
+ 372: ; preds = %__nv_exp2f.exit12
436
+ %373 = tail call float @llvm.nvvm.ex2.approx.f(float %333) #6, !dbg !35
437
+ br label %__nv_exp2f.exit15, !dbg !35
438
+
439
+ __nv_exp2f.exit15: ; preds = %370, %372
440
+ %.0.i14 = phi float [ %371, %370 ], [ %373, %372 ], !dbg !35
441
+ %374 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
442
+ %.not.i16 = icmp eq i32 %374, 0, !dbg !35
443
+ br i1 %.not.i16, label %377, label %375, !dbg !35
444
+
445
+ 375: ; preds = %__nv_exp2f.exit15
446
+ %376 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %334) #6, !dbg !35
447
+ br label %__nv_exp2f.exit18, !dbg !35
448
+
449
+ 377: ; preds = %__nv_exp2f.exit15
450
+ %378 = tail call float @llvm.nvvm.ex2.approx.f(float %334) #6, !dbg !35
451
+ br label %__nv_exp2f.exit18, !dbg !35
452
+
453
+ __nv_exp2f.exit18: ; preds = %375, %377
454
+ %.0.i17 = phi float [ %376, %375 ], [ %378, %377 ], !dbg !35
455
+ %379 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
456
+ %.not.i19 = icmp eq i32 %379, 0, !dbg !35
457
+ br i1 %.not.i19, label %382, label %380, !dbg !35
458
+
459
+ 380: ; preds = %__nv_exp2f.exit18
460
+ %381 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %335) #6, !dbg !35
461
+ br label %__nv_exp2f.exit21, !dbg !35
462
+
463
+ 382: ; preds = %__nv_exp2f.exit18
464
+ %383 = tail call float @llvm.nvvm.ex2.approx.f(float %335) #6, !dbg !35
465
+ br label %__nv_exp2f.exit21, !dbg !35
466
+
467
+ __nv_exp2f.exit21: ; preds = %380, %382
468
+ %.0.i20 = phi float [ %381, %380 ], [ %383, %382 ], !dbg !35
469
+ %384 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
470
+ %.not.i22 = icmp eq i32 %384, 0, !dbg !35
471
+ br i1 %.not.i22, label %387, label %385, !dbg !35
472
+
473
+ 385: ; preds = %__nv_exp2f.exit21
474
+ %386 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %336) #6, !dbg !35
475
+ br label %__nv_exp2f.exit24, !dbg !35
476
+
477
+ 387: ; preds = %__nv_exp2f.exit21
478
+ %388 = tail call float @llvm.nvvm.ex2.approx.f(float %336) #6, !dbg !35
479
+ br label %__nv_exp2f.exit24, !dbg !35
480
+
481
+ __nv_exp2f.exit24: ; preds = %385, %387
482
+ %.0.i23 = phi float [ %386, %385 ], [ %388, %387 ], !dbg !35
483
+ %389 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
484
+ %.not.i25 = icmp eq i32 %389, 0, !dbg !35
485
+ br i1 %.not.i25, label %392, label %390, !dbg !35
486
+
487
+ 390: ; preds = %__nv_exp2f.exit24
488
+ %391 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %337) #6, !dbg !35
489
+ br label %__nv_exp2f.exit27, !dbg !35
490
+
491
+ 392: ; preds = %__nv_exp2f.exit24
492
+ %393 = tail call float @llvm.nvvm.ex2.approx.f(float %337) #6, !dbg !35
493
+ br label %__nv_exp2f.exit27, !dbg !35
494
+
495
+ __nv_exp2f.exit27: ; preds = %390, %392
496
+ %.0.i26 = phi float [ %391, %390 ], [ %393, %392 ], !dbg !35
497
+ %394 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
498
+ %.not.i28 = icmp eq i32 %394, 0, !dbg !35
499
+ br i1 %.not.i28, label %397, label %395, !dbg !35
500
+
501
+ 395: ; preds = %__nv_exp2f.exit27
502
+ %396 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %338) #6, !dbg !35
503
+ br label %__nv_exp2f.exit30, !dbg !35
504
+
505
+ 397: ; preds = %__nv_exp2f.exit27
506
+ %398 = tail call float @llvm.nvvm.ex2.approx.f(float %338) #6, !dbg !35
507
+ br label %__nv_exp2f.exit30, !dbg !35
508
+
509
+ __nv_exp2f.exit30: ; preds = %395, %397
510
+ %.0.i29 = phi float [ %396, %395 ], [ %398, %397 ], !dbg !35
511
+ %399 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
512
+ %.not.i31 = icmp eq i32 %399, 0, !dbg !35
513
+ br i1 %.not.i31, label %402, label %400, !dbg !35
514
+
515
+ 400: ; preds = %__nv_exp2f.exit30
516
+ %401 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %339) #6, !dbg !35
517
+ br label %__nv_exp2f.exit33, !dbg !35
518
+
519
+ 402: ; preds = %__nv_exp2f.exit30
520
+ %403 = tail call float @llvm.nvvm.ex2.approx.f(float %339) #6, !dbg !35
521
+ br label %__nv_exp2f.exit33, !dbg !35
522
+
523
+ __nv_exp2f.exit33: ; preds = %400, %402
524
+ %.0.i32 = phi float [ %401, %400 ], [ %403, %402 ], !dbg !35
525
+ %404 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
526
+ %.not.i34 = icmp eq i32 %404, 0, !dbg !35
527
+ br i1 %.not.i34, label %407, label %405, !dbg !35
528
+
529
+ 405: ; preds = %__nv_exp2f.exit33
530
+ %406 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %340) #6, !dbg !35
531
+ br label %__nv_exp2f.exit36, !dbg !35
532
+
533
+ 407: ; preds = %__nv_exp2f.exit33
534
+ %408 = tail call float @llvm.nvvm.ex2.approx.f(float %340) #6, !dbg !35
535
+ br label %__nv_exp2f.exit36, !dbg !35
536
+
537
+ __nv_exp2f.exit36: ; preds = %405, %407
538
+ %.0.i35 = phi float [ %406, %405 ], [ %408, %407 ], !dbg !35
539
+ %409 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
540
+ %.not.i37 = icmp eq i32 %409, 0, !dbg !35
541
+ br i1 %.not.i37, label %412, label %410, !dbg !35
542
+
543
+ 410: ; preds = %__nv_exp2f.exit36
544
+ %411 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %341) #6, !dbg !35
545
+ br label %__nv_exp2f.exit39, !dbg !35
546
+
547
+ 412: ; preds = %__nv_exp2f.exit36
548
+ %413 = tail call float @llvm.nvvm.ex2.approx.f(float %341) #6, !dbg !35
549
+ br label %__nv_exp2f.exit39, !dbg !35
550
+
551
+ __nv_exp2f.exit39: ; preds = %410, %412
552
+ %.0.i38 = phi float [ %411, %410 ], [ %413, %412 ], !dbg !35
553
+ %414 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
554
+ %.not.i40 = icmp eq i32 %414, 0, !dbg !35
555
+ br i1 %.not.i40, label %417, label %415, !dbg !35
556
+
557
+ 415: ; preds = %__nv_exp2f.exit39
558
+ %416 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %342) #6, !dbg !35
559
+ br label %__nv_exp2f.exit42, !dbg !35
560
+
561
+ 417: ; preds = %__nv_exp2f.exit39
562
+ %418 = tail call float @llvm.nvvm.ex2.approx.f(float %342) #6, !dbg !35
563
+ br label %__nv_exp2f.exit42, !dbg !35
564
+
565
+ __nv_exp2f.exit42: ; preds = %415, %417
566
+ %.0.i41 = phi float [ %416, %415 ], [ %418, %417 ], !dbg !35
567
+ %419 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !35
568
+ %.not.i43 = icmp eq i32 %419, 0, !dbg !35
569
+ br i1 %.not.i43, label %422, label %420, !dbg !35
570
+
571
+ 420: ; preds = %__nv_exp2f.exit42
572
+ %421 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %343) #6, !dbg !35
573
+ br label %__nv_exp2f.exit45, !dbg !35
574
+
575
+ 422: ; preds = %__nv_exp2f.exit42
576
+ %423 = tail call float @llvm.nvvm.ex2.approx.f(float %343) #6, !dbg !35
577
+ br label %__nv_exp2f.exit45, !dbg !35
578
+
579
+ __nv_exp2f.exit45: ; preds = %420, %422
580
+ %.0.i44 = phi float [ %421, %420 ], [ %423, %422 ], !dbg !35
581
+ %424 = fmul float %.0.i, %86, !dbg !36
582
+ %425 = fmul float %.0.i2, %87, !dbg !36
583
+ %426 = fmul float %.0.i5, %88, !dbg !36
584
+ %427 = fmul float %.0.i8, %89, !dbg !36
585
+ %428 = fmul float %.0.i11, %95, !dbg !36
586
+ %429 = fmul float %.0.i14, %96, !dbg !36
587
+ %430 = fmul float %.0.i17, %97, !dbg !36
588
+ %431 = fmul float %.0.i20, %98, !dbg !36
589
+ %432 = fmul float %.0.i23, %104, !dbg !36
590
+ %433 = fmul float %.0.i26, %105, !dbg !36
591
+ %434 = fmul float %.0.i29, %106, !dbg !36
592
+ %435 = fmul float %.0.i32, %107, !dbg !36
593
+ %436 = fmul float %.0.i35, %113, !dbg !36
594
+ %437 = fmul float %.0.i38, %114, !dbg !36
595
+ %438 = fmul float %.0.i41, %115, !dbg !36
596
+ %439 = fmul float %.0.i44, %116, !dbg !36
597
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37
598
+ %440 = fadd float %424, %428, !dbg !41
599
+ %441 = fadd float %425, %429, !dbg !41
600
+ %442 = fadd float %426, %430, !dbg !41
601
+ %443 = fadd float %427, %431, !dbg !41
602
+ %444 = fadd float %440, %432, !dbg !41
603
+ %445 = fadd float %441, %433, !dbg !41
604
+ %446 = fadd float %442, %434, !dbg !41
605
+ %447 = fadd float %443, %435, !dbg !41
606
+ %448 = fadd float %444, %436, !dbg !41
607
+ %449 = fadd float %445, %437, !dbg !41
608
+ %450 = fadd float %446, %438, !dbg !41
609
+ %451 = fadd float %447, %439, !dbg !41
610
+ %452 = bitcast float %448 to i32, !dbg !37
611
+ %453 = select i1 %18, i32 %452, i32 0, !dbg !41
612
+ %454 = insertelement <1 x i32> poison, i32 %453, i64 0, !dbg !37
613
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %182, <1 x i32> %454, i1 true) #6, !dbg !37
614
+ %455 = bitcast float %449 to i32, !dbg !37
615
+ %456 = select i1 %18, i32 %455, i32 0, !dbg !41
616
+ %457 = insertelement <1 x i32> poison, i32 %456, i64 0, !dbg !37
617
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %185, <1 x i32> %457, i1 true) #6, !dbg !37
618
+ %458 = bitcast float %450 to i32, !dbg !37
619
+ %459 = select i1 %18, i32 %458, i32 0, !dbg !41
620
+ %460 = insertelement <1 x i32> poison, i32 %459, i64 0, !dbg !37
621
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %460, i1 true) #6, !dbg !37
622
+ %461 = bitcast float %451 to i32, !dbg !37
623
+ %462 = select i1 %18, i32 %461, i32 0, !dbg !41
624
+ %463 = insertelement <1 x i32> poison, i32 %462, i64 0, !dbg !37
625
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %463, i1 true) #6, !dbg !37
626
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37
627
+ %464 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %193, i1 true) #6, !dbg !37
628
+ %465 = bitcast i32 %464 to float, !dbg !37
629
+ %466 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %464, i32 4, i32 31), !dbg !37
630
+ %467 = bitcast i32 %466 to float, !dbg !37
631
+ %468 = fadd float %465, %467, !dbg !41
632
+ %469 = bitcast float %468 to i32, !dbg !37
633
+ %470 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %469, i32 2, i32 31), !dbg !37
634
+ %471 = bitcast i32 %470 to float, !dbg !37
635
+ %472 = fadd float %468, %471, !dbg !41
636
+ %473 = bitcast float %472 to i32, !dbg !37
637
+ %474 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %473, i32 1, i32 31), !dbg !37
638
+ %475 = bitcast i32 %474 to float, !dbg !37
639
+ %476 = fadd float %472, %475, !dbg !41
640
+ %477 = bitcast float %476 to <1 x i32>, !dbg !37
641
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %193, <1 x i32> %477, i1 %216) #6, !dbg !37
642
+ %478 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %219, i1 true) #6, !dbg !37
643
+ %479 = bitcast i32 %478 to float, !dbg !37
644
+ %480 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %478, i32 4, i32 31), !dbg !37
645
+ %481 = bitcast i32 %480 to float, !dbg !37
646
+ %482 = fadd float %479, %481, !dbg !41
647
+ %483 = bitcast float %482 to i32, !dbg !37
648
+ %484 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %483, i32 2, i32 31), !dbg !37
649
+ %485 = bitcast i32 %484 to float, !dbg !37
650
+ %486 = fadd float %482, %485, !dbg !41
651
+ %487 = bitcast float %486 to i32, !dbg !37
652
+ %488 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %487, i32 1, i32 31), !dbg !37
653
+ %489 = bitcast i32 %488 to float, !dbg !37
654
+ %490 = fadd float %486, %489, !dbg !41
655
+ %491 = bitcast float %490 to <1 x i32>, !dbg !37
656
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %219, <1 x i32> %491, i1 %216) #6, !dbg !37
657
+ %492 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %243, i1 true) #6, !dbg !37
658
+ %493 = bitcast i32 %492 to float, !dbg !37
659
+ %494 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %492, i32 4, i32 31), !dbg !37
660
+ %495 = bitcast i32 %494 to float, !dbg !37
661
+ %496 = fadd float %493, %495, !dbg !41
662
+ %497 = bitcast float %496 to i32, !dbg !37
663
+ %498 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %497, i32 2, i32 31), !dbg !37
664
+ %499 = bitcast i32 %498 to float, !dbg !37
665
+ %500 = fadd float %496, %499, !dbg !41
666
+ %501 = bitcast float %500 to i32, !dbg !37
667
+ %502 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %501, i32 1, i32 31), !dbg !37
668
+ %503 = bitcast i32 %502 to float, !dbg !37
669
+ %504 = fadd float %500, %503, !dbg !41
670
+ %505 = bitcast float %504 to <1 x i32>, !dbg !37
671
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %243, <1 x i32> %505, i1 %216) #6, !dbg !37
672
+ %506 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %267, i1 true) #6, !dbg !37
673
+ %507 = bitcast i32 %506 to float, !dbg !37
674
+ %508 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %506, i32 4, i32 31), !dbg !37
675
+ %509 = bitcast i32 %508 to float, !dbg !37
676
+ %510 = fadd float %507, %509, !dbg !41
677
+ %511 = bitcast float %510 to i32, !dbg !37
678
+ %512 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %511, i32 2, i32 31), !dbg !37
679
+ %513 = bitcast i32 %512 to float, !dbg !37
680
+ %514 = fadd float %510, %513, !dbg !41
681
+ %515 = bitcast float %514 to i32, !dbg !37
682
+ %516 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %515, i32 1, i32 31), !dbg !37
683
+ %517 = bitcast i32 %516 to float, !dbg !37
684
+ %518 = fadd float %514, %517, !dbg !41
685
+ %519 = bitcast float %518 to <1 x i32>, !dbg !37
686
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %267, <1 x i32> %519, i1 %216) #6, !dbg !37
687
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37
688
+ %520 = load float, ptr addrspace(3) %181, align 16, !dbg !37
689
+ %521 = load float, ptr addrspace(3) %184, align 16, !dbg !37
690
+ %522 = load float, ptr addrspace(3) %187, align 16, !dbg !37
691
+ %523 = load float, ptr addrspace(3) %190, align 16, !dbg !37
692
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !42
693
+ %524 = insertelement <4 x float> poison, float %520, i64 0, !dbg !42
694
+ %525 = insertelement <4 x float> %524, float %521, i64 1, !dbg !42
695
+ %526 = insertelement <4 x float> %525, float %522, i64 2, !dbg !42
696
+ %527 = insertelement <4 x float> %526, float %523, i64 3, !dbg !42
697
+ store <4 x float> %527, ptr addrspace(3) %296, align 16, !dbg !42
698
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !42
699
+ %528 = tail call i32 asm sideeffect "ldmatrix.sync.aligned.m8n8.x1.shared.b16 {$0}, [$1];", "=r,r"(i32 %306) #6, !dbg !42
700
+ %529 = select i1 %308, float 1.000000e+00, float %520, !dbg !43
701
+ %530 = select i1 %309, float 1.000000e+00, float %521, !dbg !43
702
+ %531 = select i1 %310, float 1.000000e+00, float %522, !dbg !43
703
+ %532 = select i1 %311, float 1.000000e+00, float %523, !dbg !43
704
+ %533 = fcmp olt float %529, 0x3810000000000000, !dbg !44
705
+ %534 = fmul float %529, 0x4160000000000000, !dbg !44
706
+ %.02.i = select i1 %533, float %534, float %529, !dbg !44
707
+ %i.i.0.i = select i1 %533, float -2.300000e+01, float 0.000000e+00, !dbg !44
708
+ %535 = bitcast float %.02.i to i32, !dbg !44
709
+ %536 = add i32 %535, -1060439283, !dbg !44
710
+ %537 = and i32 %536, -8388608, !dbg !44
711
+ %538 = sub i32 %535, %537, !dbg !44
712
+ %539 = bitcast i32 %538 to float, !dbg !44
713
+ %540 = sitofp i32 %537 to float, !dbg !44
714
+ %541 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
715
+ %.not.i46 = icmp eq i32 %541, 0, !dbg !44
716
+ %542 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %540, float 0x3E80000000000000, float %i.i.0.i) #6, !dbg !44
717
+ %543 = tail call float @llvm.nvvm.fma.rn.f(float %540, float 0x3E80000000000000, float %i.i.0.i) #6, !dbg !44
718
+ %.08.i = select i1 %.not.i46, float %543, float %542, !dbg !44
719
+ %544 = fadd float %539, -1.000000e+00, !dbg !44
720
+ %545 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
721
+ %.not1.i = icmp eq i32 %545, 0, !dbg !44
722
+ %546 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0x3FB8D64FE0000000, float %544, float 0xBFC58FE600000000) #6, !dbg !44
723
+ %547 = tail call float @llvm.nvvm.fma.rn.f(float 0x3FB8D64FE0000000, float %544, float 0xBFC58FE600000000) #6, !dbg !44
724
+ %.010.i = select i1 %.not1.i, float %547, float %546, !dbg !44
725
+ %548 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
726
+ %.not2.i = icmp eq i32 %548, 0, !dbg !44
727
+ %549 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i, float %544, float 0x3FC5F9E540000000) #6, !dbg !44
728
+ %550 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i, float %544, float 0x3FC5F9E540000000) #6, !dbg !44
729
+ %.011.i = select i1 %.not2.i, float %550, float %549, !dbg !44
730
+ %551 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
731
+ %.not3.i = icmp eq i32 %551, 0, !dbg !44
732
+ %552 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i, float %544, float 0xBFC6E9C860000000) #6, !dbg !44
733
+ %553 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i, float %544, float 0xBFC6E9C860000000) #6, !dbg !44
734
+ %.012.i = select i1 %.not3.i, float %553, float %552, !dbg !44
735
+ %554 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
736
+ %.not4.i = icmp eq i32 %554, 0, !dbg !44
737
+ %555 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i, float %544, float 0x3FCA417E80000000) #6, !dbg !44
738
+ %556 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i, float %544, float 0x3FCA417E80000000) #6, !dbg !44
739
+ %.09.i = select i1 %.not4.i, float %556, float %555, !dbg !44
740
+ %557 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
741
+ %.not5.i = icmp eq i32 %557, 0, !dbg !44
742
+ %558 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i, float %544, float 0xBFCEC79160000000) #6, !dbg !44
743
+ %559 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i, float %544, float 0xBFCEC79160000000) #6, !dbg !44
744
+ %.05.i = select i1 %.not5.i, float %559, float %558, !dbg !44
745
+ %560 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
746
+ %.not6.i = icmp eq i32 %560, 0, !dbg !44
747
+ %561 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %544, float 0x3FD277F320000000) #6, !dbg !44
748
+ %562 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %544, float 0x3FD277F320000000) #6, !dbg !44
749
+ %.01.i = select i1 %.not6.i, float %562, float %561, !dbg !44
750
+ %563 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
751
+ %.not7.i = icmp eq i32 %563, 0, !dbg !44
752
+ %564 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i, float %544, float 0xBFD7154920000000) #6, !dbg !44
753
+ %565 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i, float %544, float 0xBFD7154920000000) #6, !dbg !44
754
+ %.0.i47 = select i1 %.not7.i, float %565, float %564, !dbg !44
755
+ %566 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
756
+ %.not8.i = icmp eq i32 %566, 0, !dbg !44
757
+ %567 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i47, float %544, float 0x3FDEC70940000000) #6, !dbg !44
758
+ %568 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i47, float %544, float 0x3FDEC70940000000) #6, !dbg !44
759
+ %.07.i = select i1 %.not8.i, float %568, float %567, !dbg !44
760
+ %569 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
761
+ %.not9.i = icmp eq i32 %569, 0, !dbg !44
762
+ %570 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %544, float 0xBFE7154760000000) #6, !dbg !44
763
+ %571 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %544, float 0xBFE7154760000000) #6, !dbg !44
764
+ %.06.i = select i1 %.not9.i, float %571, float %570, !dbg !44
765
+ %572 = fmul float %544, %.06.i, !dbg !44
766
+ %573 = fmul float %544, %572, !dbg !44
767
+ %574 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
768
+ %.not10.i = icmp eq i32 %574, 0, !dbg !44
769
+ %575 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %544, float 0x3FF7154760000000, float %573) #6, !dbg !44
770
+ %576 = tail call float @llvm.nvvm.fma.rn.f(float %544, float 0x3FF7154760000000, float %573) #6, !dbg !44
771
+ %.04.i = select i1 %.not10.i, float %576, float %575, !dbg !44
772
+ %577 = fadd float %.08.i, %.04.i, !dbg !44
773
+ %578 = icmp ugt i32 %535, 2139095039, !dbg !44
774
+ br i1 %578, label %__nv_fmaf_rn.exit.i.i, label %__nv_log2f.exit, !dbg !44
775
+
776
+ __nv_fmaf_rn.exit.i.i: ; preds = %__nv_exp2f.exit45
777
+ %579 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
778
+ %.not11.i = icmp eq i32 %579, 0, !dbg !44
779
+ %580 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !44
780
+ %581 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !44
781
+ %.03.i = select i1 %.not11.i, float %581, float %580, !dbg !44
782
+ br label %__nv_log2f.exit, !dbg !44
783
+
784
+ __nv_log2f.exit: ; preds = %__nv_exp2f.exit45, %__nv_fmaf_rn.exit.i.i
785
+ %r.i.0.i = phi float [ %.03.i, %__nv_fmaf_rn.exit.i.i ], [ %577, %__nv_exp2f.exit45 ], !dbg !44
786
+ %582 = fcmp olt float %530, 0x3810000000000000, !dbg !44
787
+ %583 = fmul float %530, 0x4160000000000000, !dbg !44
788
+ %.02.i48 = select i1 %582, float %583, float %530, !dbg !44
789
+ %i.i.0.i49 = select i1 %582, float -2.300000e+01, float 0.000000e+00, !dbg !44
790
+ %584 = bitcast float %.02.i48 to i32, !dbg !44
791
+ %585 = add i32 %584, -1060439283, !dbg !44
792
+ %586 = and i32 %585, -8388608, !dbg !44
793
+ %587 = sub i32 %584, %586, !dbg !44
794
+ %588 = bitcast i32 %587 to float, !dbg !44
795
+ %589 = sitofp i32 %586 to float, !dbg !44
796
+ %590 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
797
+ %.not.i50 = icmp eq i32 %590, 0, !dbg !44
798
+ %591 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %589, float 0x3E80000000000000, float %i.i.0.i49) #6, !dbg !44
799
+ %592 = tail call float @llvm.nvvm.fma.rn.f(float %589, float 0x3E80000000000000, float %i.i.0.i49) #6, !dbg !44
800
+ %.08.i51 = select i1 %.not.i50, float %592, float %591, !dbg !44
801
+ %593 = fadd float %588, -1.000000e+00, !dbg !44
802
+ %594 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
803
+ %.not1.i52 = icmp eq i32 %594, 0, !dbg !44
804
+ %595 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0x3FB8D64FE0000000, float %593, float 0xBFC58FE600000000) #6, !dbg !44
805
+ %596 = tail call float @llvm.nvvm.fma.rn.f(float 0x3FB8D64FE0000000, float %593, float 0xBFC58FE600000000) #6, !dbg !44
806
+ %.010.i53 = select i1 %.not1.i52, float %596, float %595, !dbg !44
807
+ %597 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
808
+ %.not2.i54 = icmp eq i32 %597, 0, !dbg !44
809
+ %598 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i53, float %593, float 0x3FC5F9E540000000) #6, !dbg !44
810
+ %599 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i53, float %593, float 0x3FC5F9E540000000) #6, !dbg !44
811
+ %.011.i55 = select i1 %.not2.i54, float %599, float %598, !dbg !44
812
+ %600 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
813
+ %.not3.i56 = icmp eq i32 %600, 0, !dbg !44
814
+ %601 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i55, float %593, float 0xBFC6E9C860000000) #6, !dbg !44
815
+ %602 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i55, float %593, float 0xBFC6E9C860000000) #6, !dbg !44
816
+ %.012.i57 = select i1 %.not3.i56, float %602, float %601, !dbg !44
817
+ %603 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
818
+ %.not4.i58 = icmp eq i32 %603, 0, !dbg !44
819
+ %604 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i57, float %593, float 0x3FCA417E80000000) #6, !dbg !44
820
+ %605 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i57, float %593, float 0x3FCA417E80000000) #6, !dbg !44
821
+ %.09.i59 = select i1 %.not4.i58, float %605, float %604, !dbg !44
822
+ %606 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
823
+ %.not5.i60 = icmp eq i32 %606, 0, !dbg !44
824
+ %607 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i59, float %593, float 0xBFCEC79160000000) #6, !dbg !44
825
+ %608 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i59, float %593, float 0xBFCEC79160000000) #6, !dbg !44
826
+ %.05.i61 = select i1 %.not5.i60, float %608, float %607, !dbg !44
827
+ %609 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
828
+ %.not6.i62 = icmp eq i32 %609, 0, !dbg !44
829
+ %610 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i61, float %593, float 0x3FD277F320000000) #6, !dbg !44
830
+ %611 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i61, float %593, float 0x3FD277F320000000) #6, !dbg !44
831
+ %.01.i63 = select i1 %.not6.i62, float %611, float %610, !dbg !44
832
+ %612 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
833
+ %.not7.i64 = icmp eq i32 %612, 0, !dbg !44
834
+ %613 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i63, float %593, float 0xBFD7154920000000) #6, !dbg !44
835
+ %614 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i63, float %593, float 0xBFD7154920000000) #6, !dbg !44
836
+ %.0.i65 = select i1 %.not7.i64, float %614, float %613, !dbg !44
837
+ %615 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
838
+ %.not8.i66 = icmp eq i32 %615, 0, !dbg !44
839
+ %616 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i65, float %593, float 0x3FDEC70940000000) #6, !dbg !44
840
+ %617 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i65, float %593, float 0x3FDEC70940000000) #6, !dbg !44
841
+ %.07.i67 = select i1 %.not8.i66, float %617, float %616, !dbg !44
842
+ %618 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
843
+ %.not9.i68 = icmp eq i32 %618, 0, !dbg !44
844
+ %619 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i67, float %593, float 0xBFE7154760000000) #6, !dbg !44
845
+ %620 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i67, float %593, float 0xBFE7154760000000) #6, !dbg !44
846
+ %.06.i69 = select i1 %.not9.i68, float %620, float %619, !dbg !44
847
+ %621 = fmul float %593, %.06.i69, !dbg !44
848
+ %622 = fmul float %593, %621, !dbg !44
849
+ %623 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
850
+ %.not10.i70 = icmp eq i32 %623, 0, !dbg !44
851
+ %624 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %593, float 0x3FF7154760000000, float %622) #6, !dbg !44
852
+ %625 = tail call float @llvm.nvvm.fma.rn.f(float %593, float 0x3FF7154760000000, float %622) #6, !dbg !44
853
+ %.04.i71 = select i1 %.not10.i70, float %625, float %624, !dbg !44
854
+ %626 = fadd float %.08.i51, %.04.i71, !dbg !44
855
+ %627 = icmp ugt i32 %584, 2139095039, !dbg !44
856
+ br i1 %627, label %__nv_fmaf_rn.exit.i.i74, label %__nv_log2f.exit77, !dbg !44
857
+
858
+ __nv_fmaf_rn.exit.i.i74: ; preds = %__nv_log2f.exit
859
+ %628 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
860
+ %.not11.i75 = icmp eq i32 %628, 0, !dbg !44
861
+ %629 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i48, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !44
862
+ %630 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i48, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !44
863
+ %.03.i76 = select i1 %.not11.i75, float %630, float %629, !dbg !44
864
+ br label %__nv_log2f.exit77, !dbg !44
865
+
866
+ __nv_log2f.exit77: ; preds = %__nv_log2f.exit, %__nv_fmaf_rn.exit.i.i74
867
+ %r.i.0.i72 = phi float [ %.03.i76, %__nv_fmaf_rn.exit.i.i74 ], [ %626, %__nv_log2f.exit ], !dbg !44
868
+ %631 = fcmp olt float %531, 0x3810000000000000, !dbg !44
869
+ %632 = fmul float %531, 0x4160000000000000, !dbg !44
870
+ %.02.i78 = select i1 %631, float %632, float %531, !dbg !44
871
+ %i.i.0.i79 = select i1 %631, float -2.300000e+01, float 0.000000e+00, !dbg !44
872
+ %633 = bitcast float %.02.i78 to i32, !dbg !44
873
+ %634 = add i32 %633, -1060439283, !dbg !44
874
+ %635 = and i32 %634, -8388608, !dbg !44
875
+ %636 = sub i32 %633, %635, !dbg !44
876
+ %637 = bitcast i32 %636 to float, !dbg !44
877
+ %638 = sitofp i32 %635 to float, !dbg !44
878
+ %639 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
879
+ %.not.i80 = icmp eq i32 %639, 0, !dbg !44
880
+ %640 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %638, float 0x3E80000000000000, float %i.i.0.i79) #6, !dbg !44
881
+ %641 = tail call float @llvm.nvvm.fma.rn.f(float %638, float 0x3E80000000000000, float %i.i.0.i79) #6, !dbg !44
882
+ %.08.i81 = select i1 %.not.i80, float %641, float %640, !dbg !44
883
+ %642 = fadd float %637, -1.000000e+00, !dbg !44
884
+ %643 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
885
+ %.not1.i82 = icmp eq i32 %643, 0, !dbg !44
886
+ %644 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0x3FB8D64FE0000000, float %642, float 0xBFC58FE600000000) #6, !dbg !44
887
+ %645 = tail call float @llvm.nvvm.fma.rn.f(float 0x3FB8D64FE0000000, float %642, float 0xBFC58FE600000000) #6, !dbg !44
888
+ %.010.i83 = select i1 %.not1.i82, float %645, float %644, !dbg !44
889
+ %646 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
890
+ %.not2.i84 = icmp eq i32 %646, 0, !dbg !44
891
+ %647 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i83, float %642, float 0x3FC5F9E540000000) #6, !dbg !44
892
+ %648 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i83, float %642, float 0x3FC5F9E540000000) #6, !dbg !44
893
+ %.011.i85 = select i1 %.not2.i84, float %648, float %647, !dbg !44
894
+ %649 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
895
+ %.not3.i86 = icmp eq i32 %649, 0, !dbg !44
896
+ %650 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i85, float %642, float 0xBFC6E9C860000000) #6, !dbg !44
897
+ %651 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i85, float %642, float 0xBFC6E9C860000000) #6, !dbg !44
898
+ %.012.i87 = select i1 %.not3.i86, float %651, float %650, !dbg !44
899
+ %652 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
900
+ %.not4.i88 = icmp eq i32 %652, 0, !dbg !44
901
+ %653 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i87, float %642, float 0x3FCA417E80000000) #6, !dbg !44
902
+ %654 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i87, float %642, float 0x3FCA417E80000000) #6, !dbg !44
903
+ %.09.i89 = select i1 %.not4.i88, float %654, float %653, !dbg !44
904
+ %655 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
905
+ %.not5.i90 = icmp eq i32 %655, 0, !dbg !44
906
+ %656 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i89, float %642, float 0xBFCEC79160000000) #6, !dbg !44
907
+ %657 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i89, float %642, float 0xBFCEC79160000000) #6, !dbg !44
908
+ %.05.i91 = select i1 %.not5.i90, float %657, float %656, !dbg !44
909
+ %658 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
910
+ %.not6.i92 = icmp eq i32 %658, 0, !dbg !44
911
+ %659 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i91, float %642, float 0x3FD277F320000000) #6, !dbg !44
912
+ %660 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i91, float %642, float 0x3FD277F320000000) #6, !dbg !44
913
+ %.01.i93 = select i1 %.not6.i92, float %660, float %659, !dbg !44
914
+ %661 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
915
+ %.not7.i94 = icmp eq i32 %661, 0, !dbg !44
916
+ %662 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i93, float %642, float 0xBFD7154920000000) #6, !dbg !44
917
+ %663 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i93, float %642, float 0xBFD7154920000000) #6, !dbg !44
918
+ %.0.i95 = select i1 %.not7.i94, float %663, float %662, !dbg !44
919
+ %664 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
920
+ %.not8.i96 = icmp eq i32 %664, 0, !dbg !44
921
+ %665 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i95, float %642, float 0x3FDEC70940000000) #6, !dbg !44
922
+ %666 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i95, float %642, float 0x3FDEC70940000000) #6, !dbg !44
923
+ %.07.i97 = select i1 %.not8.i96, float %666, float %665, !dbg !44
924
+ %667 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
925
+ %.not9.i98 = icmp eq i32 %667, 0, !dbg !44
926
+ %668 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i97, float %642, float 0xBFE7154760000000) #6, !dbg !44
927
+ %669 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i97, float %642, float 0xBFE7154760000000) #6, !dbg !44
928
+ %.06.i99 = select i1 %.not9.i98, float %669, float %668, !dbg !44
929
+ %670 = fmul float %642, %.06.i99, !dbg !44
930
+ %671 = fmul float %642, %670, !dbg !44
931
+ %672 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
932
+ %.not10.i100 = icmp eq i32 %672, 0, !dbg !44
933
+ %673 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %642, float 0x3FF7154760000000, float %671) #6, !dbg !44
934
+ %674 = tail call float @llvm.nvvm.fma.rn.f(float %642, float 0x3FF7154760000000, float %671) #6, !dbg !44
935
+ %.04.i101 = select i1 %.not10.i100, float %674, float %673, !dbg !44
936
+ %675 = fadd float %.08.i81, %.04.i101, !dbg !44
937
+ %676 = icmp ugt i32 %633, 2139095039, !dbg !44
938
+ br i1 %676, label %__nv_fmaf_rn.exit.i.i104, label %__nv_log2f.exit107, !dbg !44
939
+
940
+ __nv_fmaf_rn.exit.i.i104: ; preds = %__nv_log2f.exit77
941
+ %677 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
942
+ %.not11.i105 = icmp eq i32 %677, 0, !dbg !44
943
+ %678 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i78, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !44
944
+ %679 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i78, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !44
945
+ %.03.i106 = select i1 %.not11.i105, float %679, float %678, !dbg !44
946
+ br label %__nv_log2f.exit107, !dbg !44
947
+
948
+ __nv_log2f.exit107: ; preds = %__nv_log2f.exit77, %__nv_fmaf_rn.exit.i.i104
949
+ %r.i.0.i102 = phi float [ %.03.i106, %__nv_fmaf_rn.exit.i.i104 ], [ %675, %__nv_log2f.exit77 ], !dbg !44
950
+ %680 = fcmp olt float %532, 0x3810000000000000, !dbg !44
951
+ %681 = fmul float %532, 0x4160000000000000, !dbg !44
952
+ %.02.i108 = select i1 %680, float %681, float %532, !dbg !44
953
+ %i.i.0.i109 = select i1 %680, float -2.300000e+01, float 0.000000e+00, !dbg !44
954
+ %682 = bitcast float %.02.i108 to i32, !dbg !44
955
+ %683 = add i32 %682, -1060439283, !dbg !44
956
+ %684 = and i32 %683, -8388608, !dbg !44
957
+ %685 = sub i32 %682, %684, !dbg !44
958
+ %686 = bitcast i32 %685 to float, !dbg !44
959
+ %687 = sitofp i32 %684 to float, !dbg !44
960
+ %688 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
961
+ %.not.i110 = icmp eq i32 %688, 0, !dbg !44
962
+ %689 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %687, float 0x3E80000000000000, float %i.i.0.i109) #6, !dbg !44
963
+ %690 = tail call float @llvm.nvvm.fma.rn.f(float %687, float 0x3E80000000000000, float %i.i.0.i109) #6, !dbg !44
964
+ %.08.i111 = select i1 %.not.i110, float %690, float %689, !dbg !44
965
+ %691 = fadd float %686, -1.000000e+00, !dbg !44
966
+ %692 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
967
+ %.not1.i112 = icmp eq i32 %692, 0, !dbg !44
968
+ %693 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0x3FB8D64FE0000000, float %691, float 0xBFC58FE600000000) #6, !dbg !44
969
+ %694 = tail call float @llvm.nvvm.fma.rn.f(float 0x3FB8D64FE0000000, float %691, float 0xBFC58FE600000000) #6, !dbg !44
970
+ %.010.i113 = select i1 %.not1.i112, float %694, float %693, !dbg !44
971
+ %695 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
972
+ %.not2.i114 = icmp eq i32 %695, 0, !dbg !44
973
+ %696 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i113, float %691, float 0x3FC5F9E540000000) #6, !dbg !44
974
+ %697 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i113, float %691, float 0x3FC5F9E540000000) #6, !dbg !44
975
+ %.011.i115 = select i1 %.not2.i114, float %697, float %696, !dbg !44
976
+ %698 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
977
+ %.not3.i116 = icmp eq i32 %698, 0, !dbg !44
978
+ %699 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i115, float %691, float 0xBFC6E9C860000000) #6, !dbg !44
979
+ %700 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i115, float %691, float 0xBFC6E9C860000000) #6, !dbg !44
980
+ %.012.i117 = select i1 %.not3.i116, float %700, float %699, !dbg !44
981
+ %701 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
982
+ %.not4.i118 = icmp eq i32 %701, 0, !dbg !44
983
+ %702 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i117, float %691, float 0x3FCA417E80000000) #6, !dbg !44
984
+ %703 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i117, float %691, float 0x3FCA417E80000000) #6, !dbg !44
985
+ %.09.i119 = select i1 %.not4.i118, float %703, float %702, !dbg !44
986
+ %704 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
987
+ %.not5.i120 = icmp eq i32 %704, 0, !dbg !44
988
+ %705 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i119, float %691, float 0xBFCEC79160000000) #6, !dbg !44
989
+ %706 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i119, float %691, float 0xBFCEC79160000000) #6, !dbg !44
990
+ %.05.i121 = select i1 %.not5.i120, float %706, float %705, !dbg !44
991
+ %707 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
992
+ %.not6.i122 = icmp eq i32 %707, 0, !dbg !44
993
+ %708 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i121, float %691, float 0x3FD277F320000000) #6, !dbg !44
994
+ %709 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i121, float %691, float 0x3FD277F320000000) #6, !dbg !44
995
+ %.01.i123 = select i1 %.not6.i122, float %709, float %708, !dbg !44
996
+ %710 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
997
+ %.not7.i124 = icmp eq i32 %710, 0, !dbg !44
998
+ %711 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i123, float %691, float 0xBFD7154920000000) #6, !dbg !44
999
+ %712 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i123, float %691, float 0xBFD7154920000000) #6, !dbg !44
1000
+ %.0.i125 = select i1 %.not7.i124, float %712, float %711, !dbg !44
1001
+ %713 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
1002
+ %.not8.i126 = icmp eq i32 %713, 0, !dbg !44
1003
+ %714 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i125, float %691, float 0x3FDEC70940000000) #6, !dbg !44
1004
+ %715 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i125, float %691, float 0x3FDEC70940000000) #6, !dbg !44
1005
+ %.07.i127 = select i1 %.not8.i126, float %715, float %714, !dbg !44
1006
+ %716 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
1007
+ %.not9.i128 = icmp eq i32 %716, 0, !dbg !44
1008
+ %717 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i127, float %691, float 0xBFE7154760000000) #6, !dbg !44
1009
+ %718 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i127, float %691, float 0xBFE7154760000000) #6, !dbg !44
1010
+ %.06.i129 = select i1 %.not9.i128, float %718, float %717, !dbg !44
1011
+ %719 = fmul float %691, %.06.i129, !dbg !44
1012
+ %720 = fmul float %691, %719, !dbg !44
1013
+ %721 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
1014
+ %.not10.i130 = icmp eq i32 %721, 0, !dbg !44
1015
+ %722 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %691, float 0x3FF7154760000000, float %720) #6, !dbg !44
1016
+ %723 = tail call float @llvm.nvvm.fma.rn.f(float %691, float 0x3FF7154760000000, float %720) #6, !dbg !44
1017
+ %.04.i131 = select i1 %.not10.i130, float %723, float %722, !dbg !44
1018
+ %724 = fadd float %.08.i111, %.04.i131, !dbg !44
1019
+ %725 = icmp ugt i32 %682, 2139095039, !dbg !44
1020
+ br i1 %725, label %__nv_fmaf_rn.exit.i.i134, label %__nv_log2f.exit137, !dbg !44
1021
+
1022
+ __nv_fmaf_rn.exit.i.i134: ; preds = %__nv_log2f.exit107
1023
+ %726 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
1024
+ %.not11.i135 = icmp eq i32 %726, 0, !dbg !44
1025
+ %727 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i108, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !44
1026
+ %728 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i108, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !44
1027
+ %.03.i136 = select i1 %.not11.i135, float %728, float %727, !dbg !44
1028
+ br label %__nv_log2f.exit137, !dbg !44
1029
+
1030
+ __nv_log2f.exit137: ; preds = %__nv_log2f.exit107, %__nv_fmaf_rn.exit.i.i134
1031
+ %r.i.0.i132 = phi float [ %.03.i136, %__nv_fmaf_rn.exit.i.i134 ], [ %724, %__nv_log2f.exit107 ], !dbg !44
1032
+ %729 = insertelement <4 x float> poison, float %.02.i, i64 0, !dbg !44
1033
+ %730 = insertelement <4 x float> %729, float %.02.i48, i64 1, !dbg !44
1034
+ %731 = insertelement <4 x float> %730, float %.02.i78, i64 2, !dbg !44
1035
+ %732 = insertelement <4 x float> %731, float %.02.i108, i64 3, !dbg !44
1036
+ %733 = fcmp oeq <4 x float> %732, zeroinitializer, !dbg !44
1037
+ %734 = and i32 %13, 127, !dbg !10
1038
+ %735 = or disjoint i32 %12, %734, !dbg !11
1039
+ %736 = sext i32 %735 to i64, !dbg !14
1040
+ %.frozen = freeze i64 %5, !dbg !45
1041
+ %737 = sdiv i64 %736, %.frozen, !dbg !45
1042
+ %738 = icmp slt i32 %12, 0, !dbg !47
1043
+ %739 = icmp slt i64 %5, 0, !dbg !48
1044
+ %740 = xor i1 %738, %739, !dbg !49
1045
+ %741 = mul i64 %737, %.frozen, !dbg !14
1046
+ %.decomposed = sub i64 %736, %741, !dbg !14
1047
+ %.not = icmp ne i64 %.decomposed, 0, !dbg !50
1048
+ %narrow = select i1 %740, i1 %.not, i1 false, !dbg !51
1049
+ %742 = sext i1 %narrow to i64, !dbg !51
1050
+ %743 = add nsw i64 %737, %742, !dbg !51
1051
+ %744 = icmp slt i32 %735, %6, !dbg !12
1052
+ %745 = insertelement <4 x float> poison, float %r.i.0.i, i64 0, !dbg !44
1053
+ %746 = insertelement <4 x float> %745, float %r.i.0.i72, i64 1, !dbg !44
1054
+ %747 = insertelement <4 x float> %746, float %r.i.0.i102, i64 2, !dbg !44
1055
+ %748 = insertelement <4 x float> %747, float %r.i.0.i132, i64 3, !dbg !44
1056
+ %749 = select <4 x i1> %733, <4 x float> splat (float 0xFFF0000000000000), <4 x float> %748, !dbg !44
1057
+ %750 = insertelement <4 x float> poison, float %291, i64 0, !dbg !52
1058
+ %751 = insertelement <4 x float> %750, float %292, i64 1, !dbg !52
1059
+ %752 = insertelement <4 x float> %751, float %293, i64 2, !dbg !52
1060
+ %753 = insertelement <4 x float> %752, float %294, i64 3, !dbg !52
1061
+ %754 = fadd <4 x float> %753, %749, !dbg !52
1062
+ %755 = fmul <4 x float> %754, splat (float 0x3FE62E4300000000), !dbg !53
1063
+ %756 = icmp slt i64 %5, 2, !dbg !54
1064
+ %757 = icmp sgt i64 %5, 1, !dbg !55
1065
+ %758 = select i1 %757, i64 %5, i64 0, !dbg !56
1066
+ %759 = zext i1 %756 to i64, !dbg !57
1067
+ %760 = add i64 %758, %759, !dbg !58
1068
+ %761 = mul i64 %743, %760, !dbg !59
1069
+ %762 = getelementptr float, ptr addrspace(1) %4, i64 %.decomposed, !dbg !60
1070
+ %763 = getelementptr float, ptr addrspace(1) %762, i64 %761, !dbg !60
1071
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61
1072
+ store <4 x float> %755, ptr addrspace(3) %296, align 16, !dbg !61
1073
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61
1074
+ %764 = tail call i32 asm sideeffect "ldmatrix.sync.aligned.m8n8.x1.shared.b16 {$0}, [$1];", "=r,r"(i32 %306) #6, !dbg !61
1075
+ %765 = and i32 %13, 128, !dbg !61
1076
+ %766 = icmp eq i32 %765, 0, !dbg !61
1077
+ %767 = and i1 %766, %744, !dbg !61
1078
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %764, ptr addrspace(1) %763, i1 %767) #6, !dbg !61
1079
+ %768 = getelementptr float, ptr addrspace(1) %2, i64 %736, !dbg !62
1080
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %307, ptr addrspace(1) %768, i1 %767) #6, !dbg !63
1081
+ %769 = getelementptr float, ptr addrspace(1) %3, i64 %736, !dbg !64
1082
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %528, ptr addrspace(1) %769, i1 %767) #6, !dbg !65
1083
+ ret void, !dbg !66
1084
+ }
1085
+
1086
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
1087
+ declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
1088
+
1089
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
1090
+ declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
1091
+
1092
+ ; Function Attrs: convergent nocallback nounwind
1093
+ declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2
1094
+
1095
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
1096
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3
1097
+
1098
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
1099
+
1100
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
1101
+ declare float @llvm.nvvm.ex2.approx.ftz.f(float) #5
1102
+
1103
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
1104
+ declare float @llvm.nvvm.ex2.approx.f(float) #5
1105
+
1106
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
1107
+ declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #1
1108
+
1109
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
1110
+ declare float @llvm.nvvm.fma.rn.f(float, float, float) #1
1111
+
1112
+ attributes #0 = { nounwind "nvvm.reqntid"="256" }
1113
+ attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
1114
+ attributes #2 = { convergent nocallback nounwind }
1115
+ attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
1116
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
1117
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
1118
+ attributes #6 = { nounwind }
1119
+
1120
+ !llvm.dbg.cu = !{!0}
1121
+ !llvm.module.flags = !{!2, !3}
1122
+ !llvm.ident = !{!4}
1123
+
1124
+ !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
1125
+ !1 = !DIFile(filename: "c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py", directory: "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g")
1126
+ !2 = !{i32 2, !"Debug Info Version", i32 3}
1127
+ !3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
1128
+ !4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
1129
+ !5 = distinct !DISubprogram(name: "triton_per_fused_mul_1", linkageName: "triton_per_fused_mul_1", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
1130
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
1131
+ !7 = !{}
1132
+ !8 = !DILocation(line: 23, column: 28, scope: !5)
1133
+ !9 = !DILocation(line: 23, column: 33, scope: !5)
1134
+ !10 = !DILocation(line: 24, column: 44, scope: !5)
1135
+ !11 = !DILocation(line: 24, column: 23, scope: !5)
1136
+ !12 = !DILocation(line: 25, column: 21, scope: !5)
1137
+ !13 = !DILocation(line: 26, column: 38, scope: !5)
1138
+ !14 = !DILocation(line: 33, column: 19, scope: !5)
1139
+ !15 = !DILocation(line: 35, column: 38, scope: !5)
1140
+ !16 = !DILocation(line: 35, column: 42, scope: !5)
1141
+ !17 = !DILocation(line: 35, column: 35, scope: !5)
1142
+ !18 = !DILocation(line: 35, column: 30, scope: !5)
1143
+ !19 = !DILocation(line: 35, column: 49, scope: !5)
1144
+ !20 = !DILocation(line: 36, column: 30, scope: !5)
1145
+ !21 = !DILocation(line: 36, column: 49, scope: !5)
1146
+ !22 = !DILocation(line: 38, column: 33, scope: !5)
1147
+ !23 = !DILocation(line: 110, column: 15, scope: !24, inlinedAt: !26)
1148
+ !24 = distinct !DILexicalBlockFile(scope: !5, file: !25, discriminator: 0)
1149
+ !25 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime")
1150
+ !26 = !DILocation(line: 39, column: 37, scope: !5)
1151
+ !27 = !DILocation(line: 112, column: 21, scope: !24, inlinedAt: !26)
1152
+ !28 = !DILocation(line: 112, column: 16, scope: !24, inlinedAt: !26)
1153
+ !29 = !DILocation(line: 113, column: 29, scope: !24, inlinedAt: !26)
1154
+ !30 = !DILocation(line: 123, column: 29, scope: !24, inlinedAt: !26)
1155
+ !31 = !DILocation(line: 39, column: 40, scope: !5)
1156
+ !32 = !DILocation(line: 41, column: 19, scope: !5)
1157
+ !33 = !DILocation(line: 42, column: 18, scope: !5)
1158
+ !34 = !DILocation(line: 44, column: 33, scope: !5)
1159
+ !35 = !DILocation(line: 45, column: 27, scope: !5)
1160
+ !36 = !DILocation(line: 46, column: 19, scope: !5)
1161
+ !37 = !DILocation(line: 291, column: 36, scope: !38, inlinedAt: !40)
1162
+ !38 = distinct !DILexicalBlockFile(scope: !5, file: !39, discriminator: 0)
1163
+ !39 = !DIFile(filename: "standard.py", directory: "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language")
1164
+ !40 = !DILocation(line: 49, column: 26, scope: !5)
1165
+ !41 = !DILocation(line: 261, column: 15, scope: !38, inlinedAt: !40)
1166
+ !42 = !DILocation(line: 49, column: 29, scope: !5)
1167
+ !43 = !DILocation(line: 51, column: 34, scope: !5)
1168
+ !44 = !DILocation(line: 52, column: 27, scope: !5)
1169
+ !45 = !DILocation(line: 72, column: 16, scope: !24, inlinedAt: !46)
1170
+ !46 = !DILocation(line: 34, column: 51, scope: !5)
1171
+ !47 = !DILocation(line: 75, column: 25, scope: !24, inlinedAt: !46)
1172
+ !48 = !DILocation(line: 75, column: 36, scope: !24, inlinedAt: !46)
1173
+ !49 = !DILocation(line: 75, column: 32, scope: !24, inlinedAt: !46)
1174
+ !50 = !DILocation(line: 74, column: 34, scope: !24, inlinedAt: !46)
1175
+ !51 = !DILocation(line: 75, column: 47, scope: !24, inlinedAt: !46)
1176
+ !52 = !DILocation(line: 53, column: 20, scope: !5)
1177
+ !53 = !DILocation(line: 55, column: 20, scope: !5)
1178
+ !54 = !DILocation(line: 56, column: 49, scope: !5)
1179
+ !55 = !DILocation(line: 56, column: 75, scope: !5)
1180
+ !56 = !DILocation(line: 56, column: 66, scope: !5)
1181
+ !57 = !DILocation(line: 56, scope: !5)
1182
+ !58 = !DILocation(line: 56, column: 57, scope: !5)
1183
+ !59 = !DILocation(line: 56, column: 34, scope: !5)
1184
+ !60 = !DILocation(line: 56, column: 25, scope: !5)
1185
+ !61 = !DILocation(line: 56, column: 89, scope: !5)
1186
+ !62 = !DILocation(line: 57, column: 25, scope: !5)
1187
+ !63 = !DILocation(line: 57, column: 36, scope: !5)
1188
+ !64 = !DILocation(line: 58, column: 25, scope: !5)
1189
+ !65 = !DILocation(line: 58, column: 37, scope: !5)
1190
+ !66 = !DILocation(line: 58, column: 4, scope: !5)
progress/SpecForge/cache/compiled_kernels/triton/3/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.ptx ADDED
@@ -0,0 +1,1141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.7
6
+ .target sm_90a
7
+ .address_size 64
8
+
9
+ // .globl triton_per_fused_mul_1 // -- Begin function triton_per_fused_mul_1
10
+ .extern .shared .align 16 .b8 global_smem[];
11
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
12
+ // @triton_per_fused_mul_1
13
+ .visible .entry triton_per_fused_mul_1(
14
+ .param .u64 .ptr .global .align 1 triton_per_fused_mul_1_param_0,
15
+ .param .u64 .ptr .global .align 1 triton_per_fused_mul_1_param_1,
16
+ .param .u64 .ptr .global .align 1 triton_per_fused_mul_1_param_2,
17
+ .param .u64 .ptr .global .align 1 triton_per_fused_mul_1_param_3,
18
+ .param .u64 .ptr .global .align 1 triton_per_fused_mul_1_param_4,
19
+ .param .u64 triton_per_fused_mul_1_param_5,
20
+ .param .u32 triton_per_fused_mul_1_param_6,
21
+ .param .u32 triton_per_fused_mul_1_param_7,
22
+ .param .u64 .ptr .global .align 1 triton_per_fused_mul_1_param_8,
23
+ .param .u64 .ptr .global .align 1 triton_per_fused_mul_1_param_9
24
+ )
25
+ .reqntid 256
26
+ {
27
+ .reg .pred %p<110>;
28
+ .reg .b32 %r<491>;
29
+ .reg .b64 %rd<57>;
30
+ .loc 1 18 0 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:18:0
31
+ $L__func_begin0:
32
+ .loc 1 18 0 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:18:0
33
+
34
+ // %bb.0: // %__nv_exp2f.exit
35
+ ld.param.b32 %r26, [triton_per_fused_mul_1_param_6];
36
+ ld.param.b64 %rd13, [triton_per_fused_mul_1_param_5];
37
+ ld.param.b64 %rd22, [triton_per_fused_mul_1_param_0];
38
+ ld.param.b64 %rd23, [triton_per_fused_mul_1_param_1];
39
+ $L__tmp0:
40
+ .loc 1 23 28 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:23:28
41
+ mov.u32 %r143, %ctaid.x;
42
+ .loc 1 23 33 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:23:33
43
+ shl.b32 %r1, %r143, 7;
44
+ .loc 1 24 44 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:24:44
45
+ mov.u32 %r2, %tid.x;
46
+ shr.u32 %r144, %r2, 5;
47
+ and.b32 %r145, %r2, 31;
48
+ shl.b32 %r146, %r145, 2;
49
+ .loc 1 24 23 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:24:23
50
+ or.b32 %r147, %r146, %r1;
51
+ .loc 1 25 21 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:25:21
52
+ setp.lt.s32 %p1, %r147, %r26;
53
+ .loc 1 26 38 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:26:38
54
+ bfe.u32 %r148, %r2, 5, 3;
55
+ or.b32 %r149, %r148, 8;
56
+ or.b32 %r150, %r148, 16;
57
+ or.b32 %r151, %r144, 24;
58
+ .loc 1 33 19 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:33:19
59
+ cvt.s64.s32 %rd24, %r147;
60
+ .loc 1 35 38 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:35:38
61
+ shl.b64 %rd25, %rd13, 5;
62
+ .loc 1 35 42 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:35:42
63
+ cvt.u64.u32 %rd26, %r148;
64
+ cvt.u64.u32 %rd27, %r149;
65
+ cvt.u64.u32 %rd28, %r150;
66
+ cvt.u64.u32 %rd29, %r151;
67
+ .loc 1 35 35 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:35:35
68
+ mad.lo.s64 %rd30, %rd25, %rd26, %rd24;
69
+ mad.lo.s64 %rd31, %rd25, %rd27, %rd24;
70
+ mad.lo.s64 %rd32, %rd25, %rd28, %rd24;
71
+ mad.lo.s64 %rd33, %rd25, %rd29, %rd24;
72
+ .loc 1 35 30 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:35:30
73
+ shl.b64 %rd34, %rd30, 2;
74
+ add.s64 %rd14, %rd22, %rd34;
75
+ shl.b64 %rd35, %rd31, 2;
76
+ add.s64 %rd15, %rd22, %rd35;
77
+ shl.b64 %rd36, %rd32, 2;
78
+ add.s64 %rd16, %rd22, %rd36;
79
+ shl.b64 %rd37, %rd33, 2;
80
+ add.s64 %rd17, %rd22, %rd37;
81
+ mov.b32 %r31, 0;
82
+ .loc 1 35 49 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:35:49
83
+ // begin inline asm
84
+ mov.u32 %r27, %r31;
85
+ mov.u32 %r28, %r31;
86
+ mov.u32 %r29, %r31;
87
+ mov.u32 %r30, %r31;
88
+ @%p1 ld.global.v4.b32 { %r27, %r28, %r29, %r30 }, [ %rd14 + 0 ];
89
+ // end inline asm
90
+ // begin inline asm
91
+ mov.u32 %r35, %r31;
92
+ mov.u32 %r36, %r31;
93
+ mov.u32 %r37, %r31;
94
+ mov.u32 %r38, %r31;
95
+ @%p1 ld.global.v4.b32 { %r35, %r36, %r37, %r38 }, [ %rd15 + 0 ];
96
+ // end inline asm
97
+ // begin inline asm
98
+ mov.u32 %r43, %r31;
99
+ mov.u32 %r44, %r31;
100
+ mov.u32 %r45, %r31;
101
+ mov.u32 %r46, %r31;
102
+ @%p1 ld.global.v4.b32 { %r43, %r44, %r45, %r46 }, [ %rd16 + 0 ];
103
+ // end inline asm
104
+ // begin inline asm
105
+ mov.u32 %r51, %r31;
106
+ mov.u32 %r52, %r31;
107
+ mov.u32 %r53, %r31;
108
+ mov.u32 %r54, %r31;
109
+ @%p1 ld.global.v4.b32 { %r51, %r52, %r53, %r54 }, [ %rd17 + 0 ];
110
+ // end inline asm
111
+ .loc 1 36 30 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:36:30
112
+ add.s64 %rd18, %rd23, %rd34;
113
+ add.s64 %rd19, %rd23, %rd35;
114
+ add.s64 %rd20, %rd23, %rd36;
115
+ add.s64 %rd21, %rd23, %rd37;
116
+ .loc 1 36 49 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:36:49
117
+ // begin inline asm
118
+ mov.u32 %r59, %r31;
119
+ mov.u32 %r60, %r31;
120
+ mov.u32 %r61, %r31;
121
+ mov.u32 %r62, %r31;
122
+ @%p1 ld.global.v4.b32 { %r59, %r60, %r61, %r62 }, [ %rd18 + 0 ];
123
+ // end inline asm
124
+ // begin inline asm
125
+ mov.u32 %r67, %r31;
126
+ mov.u32 %r68, %r31;
127
+ mov.u32 %r69, %r31;
128
+ mov.u32 %r70, %r31;
129
+ @%p1 ld.global.v4.b32 { %r67, %r68, %r69, %r70 }, [ %rd19 + 0 ];
130
+ // end inline asm
131
+ // begin inline asm
132
+ mov.u32 %r75, %r31;
133
+ mov.u32 %r76, %r31;
134
+ mov.u32 %r77, %r31;
135
+ mov.u32 %r78, %r31;
136
+ @%p1 ld.global.v4.b32 { %r75, %r76, %r77, %r78 }, [ %rd20 + 0 ];
137
+ // end inline asm
138
+ // begin inline asm
139
+ mov.u32 %r83, %r31;
140
+ mov.u32 %r84, %r31;
141
+ mov.u32 %r85, %r31;
142
+ mov.u32 %r86, %r31;
143
+ @%p1 ld.global.v4.b32 { %r83, %r84, %r85, %r86 }, [ %rd21 + 0 ];
144
+ // end inline asm
145
+ .loc 1 38 33 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:38:33
146
+ selp.f32 %r152, %r27, 0fFF800000, %p1;
147
+ selp.f32 %r153, %r28, 0fFF800000, %p1;
148
+ selp.f32 %r154, %r29, 0fFF800000, %p1;
149
+ selp.f32 %r155, %r30, 0fFF800000, %p1;
150
+ selp.f32 %r156, %r35, 0fFF800000, %p1;
151
+ selp.f32 %r157, %r36, 0fFF800000, %p1;
152
+ selp.f32 %r158, %r37, 0fFF800000, %p1;
153
+ selp.f32 %r159, %r38, 0fFF800000, %p1;
154
+ selp.f32 %r160, %r43, 0fFF800000, %p1;
155
+ selp.f32 %r161, %r44, 0fFF800000, %p1;
156
+ selp.f32 %r162, %r45, 0fFF800000, %p1;
157
+ selp.f32 %r163, %r46, 0fFF800000, %p1;
158
+ selp.f32 %r164, %r51, 0fFF800000, %p1;
159
+ selp.f32 %r165, %r52, 0fFF800000, %p1;
160
+ selp.f32 %r166, %r53, 0fFF800000, %p1;
161
+ selp.f32 %r167, %r54, 0fFF800000, %p1;
162
+ $L__tmp1:
163
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
164
+ setp.gt.f32 %p33, %r152, %r156;
165
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
166
+ setp.nan.f32 %p34, %r152, %r152;
167
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
168
+ selp.f32 %r168, %r152, %r156, %p33;
169
+ selp.f32 %r169, %r152, %r168, %p34;
170
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
171
+ setp.gt.f32 %p35, %r153, %r157;
172
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
173
+ setp.nan.f32 %p36, %r153, %r153;
174
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
175
+ selp.f32 %r170, %r153, %r157, %p35;
176
+ selp.f32 %r171, %r153, %r170, %p36;
177
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
178
+ setp.gt.f32 %p37, %r154, %r158;
179
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
180
+ setp.nan.f32 %p38, %r154, %r154;
181
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
182
+ selp.f32 %r172, %r154, %r158, %p37;
183
+ selp.f32 %r173, %r154, %r172, %p38;
184
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
185
+ setp.gt.f32 %p39, %r155, %r159;
186
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
187
+ setp.nan.f32 %p40, %r155, %r155;
188
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
189
+ selp.f32 %r174, %r155, %r159, %p39;
190
+ selp.f32 %r175, %r155, %r174, %p40;
191
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
192
+ setp.gt.f32 %p41, %r169, %r160;
193
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
194
+ setp.nan.f32 %p42, %r169, %r169;
195
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
196
+ selp.f32 %r176, %r169, %r160, %p42;
197
+ selp.f32 %r177, %r169, %r176, %p41;
198
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
199
+ setp.gt.f32 %p43, %r171, %r161;
200
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
201
+ setp.nan.f32 %p44, %r171, %r171;
202
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
203
+ selp.f32 %r178, %r171, %r161, %p44;
204
+ selp.f32 %r179, %r171, %r178, %p43;
205
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
206
+ setp.gt.f32 %p45, %r173, %r162;
207
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
208
+ setp.nan.f32 %p46, %r173, %r173;
209
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
210
+ selp.f32 %r180, %r173, %r162, %p46;
211
+ selp.f32 %r181, %r173, %r180, %p45;
212
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
213
+ setp.gt.f32 %p47, %r175, %r163;
214
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
215
+ setp.nan.f32 %p48, %r175, %r175;
216
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
217
+ selp.f32 %r182, %r175, %r163, %p48;
218
+ selp.f32 %r183, %r175, %r182, %p47;
219
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
220
+ setp.gt.f32 %p49, %r177, %r164;
221
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
222
+ setp.nan.f32 %p50, %r177, %r177;
223
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
224
+ selp.f32 %r184, %r177, %r164, %p50;
225
+ selp.f32 %r92, %r177, %r184, %p49;
226
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
227
+ setp.gt.f32 %p51, %r179, %r165;
228
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
229
+ setp.nan.f32 %p52, %r179, %r179;
230
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
231
+ selp.f32 %r185, %r179, %r165, %p52;
232
+ selp.f32 %r94, %r179, %r185, %p51;
233
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
234
+ setp.gt.f32 %p53, %r181, %r166;
235
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
236
+ setp.nan.f32 %p54, %r181, %r181;
237
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
238
+ selp.f32 %r186, %r181, %r166, %p54;
239
+ selp.f32 %r96, %r181, %r186, %p53;
240
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
241
+ setp.gt.f32 %p55, %r183, %r167;
242
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
243
+ setp.nan.f32 %p56, %r183, %r183;
244
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
245
+ selp.f32 %r187, %r183, %r167, %p56;
246
+ selp.f32 %r98, %r183, %r187, %p55;
247
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
248
+ shl.b32 %r188, %r145, 7;
249
+ mov.b32 %r189, global_smem;
250
+ add.s32 %r190, %r189, %r188;
251
+ shl.b32 %r191, %r148, 2;
252
+ add.s32 %r91, %r190, %r191;
253
+ mov.pred %p9, -1;
254
+ // begin inline asm
255
+ @%p9 st.shared.b32 [ %r91 + 0 ], %r92;
256
+ // end inline asm
257
+ add.s32 %r93, %r91, 32;
258
+ // begin inline asm
259
+ @%p9 st.shared.b32 [ %r93 + 0 ], %r94;
260
+ // end inline asm
261
+ add.s32 %r95, %r91, 64;
262
+ // begin inline asm
263
+ @%p9 st.shared.b32 [ %r95 + 0 ], %r96;
264
+ // end inline asm
265
+ add.s32 %r97, %r91, 96;
266
+ // begin inline asm
267
+ @%p9 st.shared.b32 [ %r97 + 0 ], %r98;
268
+ // end inline asm
269
+ bar.sync 0;
270
+ shl.b32 %r192, %r2, 2;
271
+ add.s32 %r100, %r189, %r192;
272
+ // begin inline asm
273
+ @%p9 ld.shared.b32 %r99, [ %r100 + 0 ];
274
+ // end inline asm
275
+ shfl.sync.bfly.b32 %r193, %r99, 4, 31, -1;
276
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
277
+ setp.gt.f32 %p57, %r99, %r193;
278
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
279
+ setp.nan.f32 %p58, %r99, %r99;
280
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
281
+ selp.f32 %r194, %r99, %r193, %p57;
282
+ selp.f32 %r195, %r99, %r194, %p58;
283
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
284
+ shfl.sync.bfly.b32 %r196, %r195, 2, 31, -1;
285
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
286
+ setp.gt.f32 %p59, %r195, %r196;
287
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
288
+ setp.nan.f32 %p60, %r195, %r195;
289
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
290
+ selp.f32 %r197, %r195, %r196, %p60;
291
+ selp.f32 %r198, %r195, %r197, %p59;
292
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
293
+ shfl.sync.bfly.b32 %r199, %r198, 1, 31, -1;
294
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
295
+ setp.gt.f32 %p61, %r198, %r199;
296
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
297
+ setp.nan.f32 %p62, %r198, %r198;
298
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
299
+ and.b32 %r200, %r2, 7;
300
+ setp.eq.b32 %p14, %r200, 0;
301
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
302
+ selp.b32 %r201, %r198, %r199, %p62;
303
+ selp.b32 %r102, %r198, %r201, %p61;
304
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
305
+ // begin inline asm
306
+ @%p14 st.shared.b32 [ %r100 + 0 ], %r102;
307
+ // end inline asm
308
+ add.s32 %r104, %r100, 1024;
309
+ // begin inline asm
310
+ @%p9 ld.shared.b32 %r103, [ %r104 + 0 ];
311
+ // end inline asm
312
+ shfl.sync.bfly.b32 %r202, %r103, 4, 31, -1;
313
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
314
+ setp.gt.f32 %p63, %r103, %r202;
315
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
316
+ setp.nan.f32 %p64, %r103, %r103;
317
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
318
+ selp.f32 %r203, %r103, %r202, %p63;
319
+ selp.f32 %r204, %r103, %r203, %p64;
320
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
321
+ shfl.sync.bfly.b32 %r205, %r204, 2, 31, -1;
322
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
323
+ setp.gt.f32 %p65, %r204, %r205;
324
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
325
+ setp.nan.f32 %p66, %r204, %r204;
326
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
327
+ selp.f32 %r206, %r204, %r205, %p66;
328
+ selp.f32 %r207, %r204, %r206, %p65;
329
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
330
+ shfl.sync.bfly.b32 %r208, %r207, 1, 31, -1;
331
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
332
+ setp.gt.f32 %p67, %r207, %r208;
333
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
334
+ setp.nan.f32 %p68, %r207, %r207;
335
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
336
+ selp.b32 %r209, %r207, %r208, %p68;
337
+ selp.b32 %r106, %r207, %r209, %p67;
338
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
339
+ // begin inline asm
340
+ @%p14 st.shared.b32 [ %r104 + 0 ], %r106;
341
+ // end inline asm
342
+ add.s32 %r108, %r100, 2048;
343
+ // begin inline asm
344
+ @%p9 ld.shared.b32 %r107, [ %r108 + 0 ];
345
+ // end inline asm
346
+ shfl.sync.bfly.b32 %r210, %r107, 4, 31, -1;
347
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
348
+ setp.gt.f32 %p69, %r107, %r210;
349
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
350
+ setp.nan.f32 %p70, %r107, %r107;
351
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
352
+ selp.f32 %r211, %r107, %r210, %p69;
353
+ selp.f32 %r212, %r107, %r211, %p70;
354
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
355
+ shfl.sync.bfly.b32 %r213, %r212, 2, 31, -1;
356
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
357
+ setp.gt.f32 %p71, %r212, %r213;
358
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
359
+ setp.nan.f32 %p72, %r212, %r212;
360
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
361
+ selp.f32 %r214, %r212, %r213, %p72;
362
+ selp.f32 %r215, %r212, %r214, %p71;
363
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
364
+ shfl.sync.bfly.b32 %r216, %r215, 1, 31, -1;
365
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
366
+ setp.gt.f32 %p73, %r215, %r216;
367
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
368
+ setp.nan.f32 %p74, %r215, %r215;
369
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
370
+ selp.b32 %r217, %r215, %r216, %p74;
371
+ selp.b32 %r110, %r215, %r217, %p73;
372
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
373
+ // begin inline asm
374
+ @%p14 st.shared.b32 [ %r108 + 0 ], %r110;
375
+ // end inline asm
376
+ add.s32 %r112, %r100, 3072;
377
+ // begin inline asm
378
+ @%p9 ld.shared.b32 %r111, [ %r112 + 0 ];
379
+ // end inline asm
380
+ shfl.sync.bfly.b32 %r218, %r111, 4, 31, -1;
381
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
382
+ setp.gt.f32 %p75, %r111, %r218;
383
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
384
+ setp.nan.f32 %p76, %r111, %r111;
385
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
386
+ selp.f32 %r219, %r111, %r218, %p75;
387
+ selp.f32 %r220, %r111, %r219, %p76;
388
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
389
+ shfl.sync.bfly.b32 %r221, %r220, 2, 31, -1;
390
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
391
+ setp.gt.f32 %p77, %r220, %r221;
392
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
393
+ setp.nan.f32 %p78, %r220, %r220;
394
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
395
+ selp.f32 %r222, %r220, %r221, %p78;
396
+ selp.f32 %r223, %r220, %r222, %p77;
397
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
398
+ shfl.sync.bfly.b32 %r224, %r223, 1, 31, -1;
399
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
400
+ setp.gt.f32 %p79, %r223, %r224;
401
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
402
+ setp.nan.f32 %p80, %r223, %r223;
403
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
404
+ selp.b32 %r225, %r223, %r224, %p80;
405
+ selp.b32 %r114, %r223, %r225, %p79;
406
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:37 ]
407
+ // begin inline asm
408
+ @%p14 st.shared.b32 [ %r112 + 0 ], %r114;
409
+ // end inline asm
410
+ bar.sync 0;
411
+ ld.shared.b32 %r226, [%r190];
412
+ ld.shared.b32 %r227, [%r190+32];
413
+ ld.shared.b32 %r228, [%r190+64];
414
+ ld.shared.b32 %r229, [%r190+96];
415
+ $L__tmp2:
416
+ .loc 1 39 40 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:39:40
417
+ bar.sync 0;
418
+ mad.lo.s32 %r3, %r145, -112, %r190;
419
+ mov.b64 %rd2, {%r228, %r229};
420
+ mov.b64 %rd1, {%r226, %r227};
421
+ st.shared.v2.b64 [%r3], {%rd1, %rd2};
422
+ bar.sync 0;
423
+ shl.b32 %r230, %r144, 7;
424
+ shl.b32 %r231, %r2, 4;
425
+ or.b32 %r232, %r230, %r231;
426
+ and.b32 %r233, %r232, 496;
427
+ add.s32 %r461, %r189, %r233;
428
+ // begin inline asm
429
+ ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%r463}, [%r461];
430
+ // end inline asm
431
+ .loc 1 41 19 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:41:19
432
+ setp.eq.f32 %p81, %r226, 0fFF800000;
433
+ setp.eq.f32 %p82, %r227, 0fFF800000;
434
+ setp.eq.f32 %p83, %r228, 0fFF800000;
435
+ setp.eq.f32 %p84, %r229, 0fFF800000;
436
+ .loc 1 42 18 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:42:18
437
+ sub.f32 %r234, %r27, %r226;
438
+ sub.f32 %r235, %r28, %r227;
439
+ sub.f32 %r236, %r29, %r228;
440
+ sub.f32 %r237, %r30, %r229;
441
+ sub.f32 %r238, %r35, %r226;
442
+ sub.f32 %r239, %r36, %r227;
443
+ sub.f32 %r240, %r37, %r228;
444
+ sub.f32 %r241, %r38, %r229;
445
+ sub.f32 %r242, %r43, %r226;
446
+ sub.f32 %r243, %r44, %r227;
447
+ sub.f32 %r244, %r45, %r228;
448
+ sub.f32 %r245, %r46, %r229;
449
+ sub.f32 %r246, %r51, %r226;
450
+ sub.f32 %r247, %r52, %r227;
451
+ sub.f32 %r248, %r53, %r228;
452
+ sub.f32 %r249, %r54, %r229;
453
+ .loc 1 44 33 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:44:33
454
+ selp.f32 %r250, 0f00000000, %r234, %p81;
455
+ selp.f32 %r251, 0f00000000, %r235, %p82;
456
+ selp.f32 %r252, 0f00000000, %r236, %p83;
457
+ selp.f32 %r253, 0f00000000, %r237, %p84;
458
+ selp.f32 %r254, 0f00000000, %r238, %p81;
459
+ selp.f32 %r255, 0f00000000, %r239, %p82;
460
+ selp.f32 %r256, 0f00000000, %r240, %p83;
461
+ selp.f32 %r257, 0f00000000, %r241, %p84;
462
+ selp.f32 %r258, 0f00000000, %r242, %p81;
463
+ selp.f32 %r259, 0f00000000, %r243, %p82;
464
+ selp.f32 %r260, 0f00000000, %r244, %p83;
465
+ selp.f32 %r261, 0f00000000, %r245, %p84;
466
+ selp.f32 %r262, 0f00000000, %r246, %p81;
467
+ selp.f32 %r263, 0f00000000, %r247, %p82;
468
+ selp.f32 %r264, 0f00000000, %r248, %p83;
469
+ selp.f32 %r265, 0f00000000, %r249, %p84;
470
+ .loc 1 45 27 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:45:27
471
+ ex2.approx.ftz.f32 %r266, %r250;
472
+ ex2.approx.ftz.f32 %r267, %r251;
473
+ ex2.approx.ftz.f32 %r268, %r252;
474
+ ex2.approx.ftz.f32 %r269, %r253;
475
+ ex2.approx.ftz.f32 %r270, %r254;
476
+ ex2.approx.ftz.f32 %r271, %r255;
477
+ ex2.approx.ftz.f32 %r272, %r256;
478
+ ex2.approx.ftz.f32 %r273, %r257;
479
+ ex2.approx.ftz.f32 %r274, %r258;
480
+ ex2.approx.ftz.f32 %r275, %r259;
481
+ ex2.approx.ftz.f32 %r276, %r260;
482
+ ex2.approx.ftz.f32 %r277, %r261;
483
+ ex2.approx.ftz.f32 %r278, %r262;
484
+ ex2.approx.ftz.f32 %r279, %r263;
485
+ ex2.approx.ftz.f32 %r280, %r264;
486
+ ex2.approx.ftz.f32 %r281, %r265;
487
+ .loc 1 46 19 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:46:19
488
+ mul.f32 %r282, %r270, %r67;
489
+ mul.f32 %r283, %r271, %r68;
490
+ mul.f32 %r284, %r272, %r69;
491
+ mul.f32 %r285, %r273, %r70;
492
+ $L__tmp3:
493
+ .loc 3 291 36 // standard.py:291:36 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
494
+ bar.sync 0;
495
+ .loc 3 261 15 // standard.py:261:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
496
+ fma.rn.f32 %r286, %r266, %r59, %r282;
497
+ fma.rn.f32 %r287, %r267, %r60, %r283;
498
+ fma.rn.f32 %r288, %r268, %r61, %r284;
499
+ fma.rn.f32 %r289, %r269, %r62, %r285;
500
+ fma.rn.f32 %r290, %r274, %r75, %r286;
501
+ fma.rn.f32 %r291, %r275, %r76, %r287;
502
+ fma.rn.f32 %r292, %r276, %r77, %r288;
503
+ fma.rn.f32 %r293, %r277, %r78, %r289;
504
+ fma.rn.f32 %r294, %r278, %r83, %r290;
505
+ fma.rn.f32 %r295, %r279, %r84, %r291;
506
+ fma.rn.f32 %r296, %r280, %r85, %r292;
507
+ fma.rn.f32 %r297, %r281, %r86, %r293;
508
+ selp.b32 %r118, %r294, 0, %p1;
509
+ .loc 3 291 36 // standard.py:291:36 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
510
+ // begin inline asm
511
+ @%p9 st.shared.b32 [ %r91 + 0 ], %r118;
512
+ // end inline asm
513
+ .loc 3 261 15 // standard.py:261:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
514
+ selp.b32 %r120, %r295, 0, %p1;
515
+ .loc 3 291 36 // standard.py:291:36 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
516
+ // begin inline asm
517
+ @%p9 st.shared.b32 [ %r93 + 0 ], %r120;
518
+ // end inline asm
519
+ .loc 3 261 15 // standard.py:261:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
520
+ selp.b32 %r122, %r296, 0, %p1;
521
+ .loc 3 291 36 // standard.py:291:36 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
522
+ // begin inline asm
523
+ @%p9 st.shared.b32 [ %r95 + 0 ], %r122;
524
+ // end inline asm
525
+ .loc 3 261 15 // standard.py:261:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
526
+ selp.b32 %r124, %r297, 0, %p1;
527
+ .loc 3 291 36 // standard.py:291:36 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
528
+ // begin inline asm
529
+ @%p9 st.shared.b32 [ %r97 + 0 ], %r124;
530
+ // end inline asm
531
+ bar.sync 0;
532
+ // begin inline asm
533
+ @%p9 ld.shared.b32 %r125, [ %r100 + 0 ];
534
+ // end inline asm
535
+ shfl.sync.bfly.b32 %r298, %r125, 4, 31, -1;
536
+ .loc 3 261 15 // standard.py:261:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
537
+ add.f32 %r299, %r125, %r298;
538
+ .loc 3 291 36 // standard.py:291:36 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
539
+ shfl.sync.bfly.b32 %r300, %r299, 2, 31, -1;
540
+ .loc 3 261 15 // standard.py:261:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
541
+ add.f32 %r301, %r299, %r300;
542
+ .loc 3 291 36 // standard.py:291:36 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
543
+ shfl.sync.bfly.b32 %r302, %r301, 1, 31, -1;
544
+ .loc 3 261 15 // standard.py:261:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
545
+ add.f32 %r128, %r301, %r302;
546
+ .loc 3 291 36 // standard.py:291:36 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
547
+ // begin inline asm
548
+ @%p14 st.shared.b32 [ %r100 + 0 ], %r128;
549
+ // end inline asm
550
+ // begin inline asm
551
+ @%p9 ld.shared.b32 %r129, [ %r104 + 0 ];
552
+ // end inline asm
553
+ shfl.sync.bfly.b32 %r303, %r129, 4, 31, -1;
554
+ .loc 3 261 15 // standard.py:261:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
555
+ add.f32 %r304, %r129, %r303;
556
+ .loc 3 291 36 // standard.py:291:36 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
557
+ shfl.sync.bfly.b32 %r305, %r304, 2, 31, -1;
558
+ .loc 3 261 15 // standard.py:261:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
559
+ add.f32 %r306, %r304, %r305;
560
+ .loc 3 291 36 // standard.py:291:36 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
561
+ shfl.sync.bfly.b32 %r307, %r306, 1, 31, -1;
562
+ .loc 3 261 15 // standard.py:261:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
563
+ add.f32 %r132, %r306, %r307;
564
+ .loc 3 291 36 // standard.py:291:36 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
565
+ // begin inline asm
566
+ @%p14 st.shared.b32 [ %r104 + 0 ], %r132;
567
+ // end inline asm
568
+ // begin inline asm
569
+ @%p9 ld.shared.b32 %r133, [ %r108 + 0 ];
570
+ // end inline asm
571
+ shfl.sync.bfly.b32 %r308, %r133, 4, 31, -1;
572
+ .loc 3 261 15 // standard.py:261:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
573
+ add.f32 %r309, %r133, %r308;
574
+ .loc 3 291 36 // standard.py:291:36 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
575
+ shfl.sync.bfly.b32 %r310, %r309, 2, 31, -1;
576
+ .loc 3 261 15 // standard.py:261:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
577
+ add.f32 %r311, %r309, %r310;
578
+ .loc 3 291 36 // standard.py:291:36 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
579
+ shfl.sync.bfly.b32 %r312, %r311, 1, 31, -1;
580
+ .loc 3 261 15 // standard.py:261:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
581
+ add.f32 %r136, %r311, %r312;
582
+ .loc 3 291 36 // standard.py:291:36 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
583
+ // begin inline asm
584
+ @%p14 st.shared.b32 [ %r108 + 0 ], %r136;
585
+ // end inline asm
586
+ // begin inline asm
587
+ @%p9 ld.shared.b32 %r137, [ %r112 + 0 ];
588
+ // end inline asm
589
+ shfl.sync.bfly.b32 %r313, %r137, 4, 31, -1;
590
+ .loc 3 261 15 // standard.py:261:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
591
+ add.f32 %r314, %r137, %r313;
592
+ .loc 3 291 36 // standard.py:291:36 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
593
+ shfl.sync.bfly.b32 %r315, %r314, 2, 31, -1;
594
+ .loc 3 261 15 // standard.py:261:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
595
+ add.f32 %r316, %r314, %r315;
596
+ .loc 3 291 36 // standard.py:291:36 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
597
+ shfl.sync.bfly.b32 %r317, %r316, 1, 31, -1;
598
+ .loc 3 261 15 // standard.py:261:15 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
599
+ add.f32 %r140, %r316, %r317;
600
+ .loc 3 291 36 // standard.py:291:36 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:26 ]
601
+ // begin inline asm
602
+ @%p14 st.shared.b32 [ %r112 + 0 ], %r140;
603
+ // end inline asm
604
+ bar.sync 0;
605
+ ld.shared.b32 %r318, [%r190];
606
+ ld.shared.b32 %r319, [%r190+32];
607
+ ld.shared.b32 %r320, [%r190+64];
608
+ ld.shared.b32 %r321, [%r190+96];
609
+ $L__tmp4:
610
+ .loc 1 49 29 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:49:29
611
+ bar.sync 0;
612
+ st.shared.v4.b32 [%r3], {%r318, %r319, %r320, %r321};
613
+ bar.sync 0;
614
+ // begin inline asm
615
+ ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%r464}, [%r461];
616
+ // end inline asm
617
+ .loc 1 51 34 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:51:34
618
+ selp.f32 %r322, 0f3F800000, %r318, %p81;
619
+ selp.f32 %r7, 0f3F800000, %r319, %p82;
620
+ .loc 1 52 27 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:52:27
621
+ setp.lt.f32 %p85, %r322, 0f00800000;
622
+ mul.f32 %r323, %r322, 0f4B000000;
623
+ selp.f32 %r10, %r323, %r322, %p85;
624
+ selp.f32 %r324, 0fC1B80000, 0f00000000, %p85;
625
+ add.s32 %r325, %r10, -1060439283;
626
+ and.b32 %r326, %r325, -8388608;
627
+ sub.s32 %r327, %r10, %r326;
628
+ cvt.rn.f32.s32 %r328, %r326;
629
+ mov.b32 %r329, 0f34000000;
630
+ fma.rn.ftz.f32 %r330, %r328, %r329, %r324;
631
+ add.f32 %r331, %r327, 0fBF800000;
632
+ mov.b32 %r332, 0fBE2C7F30;
633
+ mov.b32 %r333, 0f3DC6B27F;
634
+ fma.rn.ftz.f32 %r334, %r333, %r331, %r332;
635
+ mov.b32 %r335, 0f3E2FCF2A;
636
+ fma.rn.ftz.f32 %r336, %r334, %r331, %r335;
637
+ mov.b32 %r337, 0fBE374E43;
638
+ fma.rn.ftz.f32 %r338, %r336, %r331, %r337;
639
+ mov.b32 %r339, 0f3E520BF4;
640
+ fma.rn.ftz.f32 %r340, %r338, %r331, %r339;
641
+ mov.b32 %r341, 0fBE763C8B;
642
+ fma.rn.ftz.f32 %r342, %r340, %r331, %r341;
643
+ mov.b32 %r343, 0f3E93BF99;
644
+ fma.rn.ftz.f32 %r344, %r342, %r331, %r343;
645
+ mov.b32 %r345, 0fBEB8AA49;
646
+ fma.rn.ftz.f32 %r346, %r344, %r331, %r345;
647
+ mov.b32 %r347, 0f3EF6384A;
648
+ fma.rn.ftz.f32 %r348, %r346, %r331, %r347;
649
+ mov.b32 %r349, 0fBF38AA3B;
650
+ fma.rn.ftz.f32 %r350, %r348, %r331, %r349;
651
+ mul.f32 %r351, %r331, %r350;
652
+ mul.f32 %r352, %r331, %r351;
653
+ mov.b32 %r353, 0f3FB8AA3B;
654
+ fma.rn.ftz.f32 %r354, %r331, %r353, %r352;
655
+ add.f32 %r487, %r330, %r354;
656
+ setp.lt.u32 %p86, %r10, 2139095040;
657
+ mov.b32 %r355, 0f7F800000;
658
+ @%p86 bra $L__BB0_2;
659
+ // %bb.1: // %__nv_fmaf_rn.exit.i.i
660
+ .loc 1 0 27 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:0:27
661
+ fma.rn.ftz.f32 %r487, %r10, %r355, %r355;
662
+ $L__BB0_2: // %__nv_log2f.exit
663
+ selp.f32 %r8, 0f3F800000, %r320, %p83;
664
+ .loc 1 52 27 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:52:27
665
+ setp.lt.f32 %p87, %r7, 0f00800000;
666
+ mul.f32 %r356, %r7, 0f4B000000;
667
+ selp.f32 %r14, %r356, %r7, %p87;
668
+ selp.f32 %r357, 0fC1B80000, 0f00000000, %p87;
669
+ add.s32 %r358, %r14, -1060439283;
670
+ and.b32 %r359, %r358, -8388608;
671
+ sub.s32 %r360, %r14, %r359;
672
+ cvt.rn.f32.s32 %r361, %r359;
673
+ fma.rn.ftz.f32 %r363, %r361, %r329, %r357;
674
+ add.f32 %r364, %r360, 0fBF800000;
675
+ fma.rn.ftz.f32 %r367, %r333, %r364, %r332;
676
+ fma.rn.ftz.f32 %r369, %r367, %r364, %r335;
677
+ fma.rn.ftz.f32 %r371, %r369, %r364, %r337;
678
+ fma.rn.ftz.f32 %r373, %r371, %r364, %r339;
679
+ fma.rn.ftz.f32 %r375, %r373, %r364, %r341;
680
+ fma.rn.ftz.f32 %r377, %r375, %r364, %r343;
681
+ fma.rn.ftz.f32 %r379, %r377, %r364, %r345;
682
+ fma.rn.ftz.f32 %r381, %r379, %r364, %r347;
683
+ fma.rn.ftz.f32 %r383, %r381, %r364, %r349;
684
+ mul.f32 %r384, %r364, %r383;
685
+ mul.f32 %r385, %r364, %r384;
686
+ fma.rn.ftz.f32 %r387, %r364, %r353, %r385;
687
+ add.f32 %r488, %r363, %r387;
688
+ setp.lt.u32 %p88, %r14, 2139095040;
689
+ @%p88 bra $L__BB0_4;
690
+ // %bb.3: // %__nv_fmaf_rn.exit.i.i74
691
+ .loc 1 0 27 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:0:27
692
+ fma.rn.ftz.f32 %r488, %r14, %r355, %r355;
693
+ $L__BB0_4: // %__nv_log2f.exit77
694
+ selp.f32 %r9, 0f3F800000, %r321, %p84;
695
+ .loc 1 52 27 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:52:27
696
+ setp.lt.f32 %p89, %r8, 0f00800000;
697
+ mul.f32 %r389, %r8, 0f4B000000;
698
+ selp.f32 %r18, %r389, %r8, %p89;
699
+ selp.f32 %r390, 0fC1B80000, 0f00000000, %p89;
700
+ add.s32 %r391, %r18, -1060439283;
701
+ and.b32 %r392, %r391, -8388608;
702
+ sub.s32 %r393, %r18, %r392;
703
+ cvt.rn.f32.s32 %r394, %r392;
704
+ mov.b32 %r395, 0f34000000;
705
+ fma.rn.ftz.f32 %r396, %r394, %r395, %r390;
706
+ add.f32 %r397, %r393, 0fBF800000;
707
+ mov.b32 %r398, 0fBE2C7F30;
708
+ mov.b32 %r399, 0f3DC6B27F;
709
+ fma.rn.ftz.f32 %r400, %r399, %r397, %r398;
710
+ mov.b32 %r401, 0f3E2FCF2A;
711
+ fma.rn.ftz.f32 %r402, %r400, %r397, %r401;
712
+ mov.b32 %r403, 0fBE374E43;
713
+ fma.rn.ftz.f32 %r404, %r402, %r397, %r403;
714
+ mov.b32 %r405, 0f3E520BF4;
715
+ fma.rn.ftz.f32 %r406, %r404, %r397, %r405;
716
+ mov.b32 %r407, 0fBE763C8B;
717
+ fma.rn.ftz.f32 %r408, %r406, %r397, %r407;
718
+ mov.b32 %r409, 0f3E93BF99;
719
+ fma.rn.ftz.f32 %r410, %r408, %r397, %r409;
720
+ mov.b32 %r411, 0fBEB8AA49;
721
+ fma.rn.ftz.f32 %r412, %r410, %r397, %r411;
722
+ mov.b32 %r413, 0f3EF6384A;
723
+ fma.rn.ftz.f32 %r414, %r412, %r397, %r413;
724
+ mov.b32 %r415, 0fBF38AA3B;
725
+ fma.rn.ftz.f32 %r416, %r414, %r397, %r415;
726
+ mul.f32 %r417, %r397, %r416;
727
+ mul.f32 %r418, %r397, %r417;
728
+ mov.b32 %r419, 0f3FB8AA3B;
729
+ fma.rn.ftz.f32 %r420, %r397, %r419, %r418;
730
+ add.f32 %r489, %r396, %r420;
731
+ setp.lt.u32 %p90, %r18, 2139095040;
732
+ mov.b32 %r421, 0f7F800000;
733
+ @%p90 bra $L__BB0_6;
734
+ // %bb.5: // %__nv_fmaf_rn.exit.i.i104
735
+ .loc 1 0 27 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:0:27
736
+ fma.rn.ftz.f32 %r489, %r18, %r421, %r421;
737
+ $L__BB0_6: // %__nv_log2f.exit107
738
+ .loc 1 52 27 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:52:27
739
+ setp.lt.f32 %p91, %r9, 0f00800000;
740
+ mul.f32 %r422, %r9, 0f4B000000;
741
+ selp.f32 %r22, %r422, %r9, %p91;
742
+ selp.f32 %r423, 0fC1B80000, 0f00000000, %p91;
743
+ add.s32 %r424, %r22, -1060439283;
744
+ and.b32 %r425, %r424, -8388608;
745
+ sub.s32 %r426, %r22, %r425;
746
+ cvt.rn.f32.s32 %r427, %r425;
747
+ fma.rn.ftz.f32 %r429, %r427, %r395, %r423;
748
+ add.f32 %r430, %r426, 0fBF800000;
749
+ fma.rn.ftz.f32 %r433, %r399, %r430, %r398;
750
+ fma.rn.ftz.f32 %r435, %r433, %r430, %r401;
751
+ fma.rn.ftz.f32 %r437, %r435, %r430, %r403;
752
+ fma.rn.ftz.f32 %r439, %r437, %r430, %r405;
753
+ fma.rn.ftz.f32 %r441, %r439, %r430, %r407;
754
+ fma.rn.ftz.f32 %r443, %r441, %r430, %r409;
755
+ fma.rn.ftz.f32 %r445, %r443, %r430, %r411;
756
+ fma.rn.ftz.f32 %r447, %r445, %r430, %r413;
757
+ fma.rn.ftz.f32 %r449, %r447, %r430, %r415;
758
+ mul.f32 %r450, %r430, %r449;
759
+ mul.f32 %r451, %r430, %r450;
760
+ fma.rn.ftz.f32 %r453, %r430, %r419, %r451;
761
+ add.f32 %r490, %r429, %r453;
762
+ setp.lt.u32 %p92, %r22, 2139095040;
763
+ @%p92 bra $L__BB0_8;
764
+ // %bb.7: // %__nv_fmaf_rn.exit.i.i134
765
+ .loc 1 0 27 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:0:27
766
+ fma.rn.ftz.f32 %r490, %r22, %r421, %r421;
767
+ $L__BB0_8: // %__nv_log2f.exit137
768
+ ld.param.b64 %rd12, [triton_per_fused_mul_1_param_4];
769
+ ld.param.b64 %rd11, [triton_per_fused_mul_1_param_3];
770
+ ld.param.b64 %rd10, [triton_per_fused_mul_1_param_2];
771
+ .loc 1 52 27 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:52:27
772
+ mov.b64 %rd4, {%r18, %r22};
773
+ mov.b64 %rd3, {%r10, %r14};
774
+ .loc 1 24 44 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:24:44
775
+ and.b32 %r455, %r2, 127;
776
+ .loc 1 24 23 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:24:23
777
+ or.b32 %r456, %r1, %r455;
778
+ .loc 1 33 19 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:33:19
779
+ cvt.s64.s32 %rd5, %r456;
780
+ $L__tmp5:
781
+ .loc 2 72 16 // triton_helpers.py:72:16 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:34:51 ]
782
+ or.b64 %rd38, %rd5, %rd13;
783
+ and.b64 %rd39, %rd38, -4294967296;
784
+ setp.ne.b64 %p93, %rd39, 0;
785
+ @%p93 bra $L__BB0_10;
786
+ bra.uni $L__BB0_9;
787
+ $L__BB0_10:
788
+ div.s64 %rd56, %rd5, %rd13;
789
+ bra.uni $L__BB0_11;
790
+ $L__BB0_9:
791
+ cvt.u32.u64 %r457, %rd13;
792
+ cvt.u32.u64 %r458, %rd5;
793
+ div.u32 %r459, %r458, %r457;
794
+ cvt.u64.u32 %rd56, %r459;
795
+ $L__tmp6:
796
+ $L__BB0_11:
797
+ .loc 2 0 16 // triton_helpers.py:0:16
798
+ cvt.u32.u64 %r465, %rd5;
799
+ .loc 1 52 27 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:52:27
800
+ mov.b64 {%r466, %r467}, %rd4;
801
+ setp.eq.f32 %p97, %r466, 0f00000000;
802
+ setp.eq.f32 %p98, %r467, 0f00000000;
803
+ mov.b64 {%r468, %r469}, %rd3;
804
+ setp.eq.f32 %p99, %r468, 0f00000000;
805
+ setp.eq.f32 %p100, %r469, 0f00000000;
806
+ $L__tmp7:
807
+ .loc 2 75 25 // triton_helpers.py:75:25 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:34:51 ]
808
+ setp.lt.s32 %p101, %r1, 0;
809
+ .loc 2 75 36 // triton_helpers.py:75:36 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:34:51 ]
810
+ setp.lt.s64 %p102, %rd13, 0;
811
+ .loc 2 75 32 // triton_helpers.py:75:32 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:34:51 ]
812
+ xor.pred %p103, %p101, %p102;
813
+ $L__tmp8:
814
+ .loc 1 33 19 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:33:19
815
+ mul.lo.s64 %rd44, %rd56, %rd13;
816
+ sub.s64 %rd45, %rd5, %rd44;
817
+ $L__tmp9:
818
+ .loc 2 74 34 // triton_helpers.py:74:34 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:34:51 ]
819
+ setp.ne.b64 %p104, %rd45, 0;
820
+ .loc 2 75 47 // triton_helpers.py:75:47 @[ c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:34:51 ]
821
+ and.pred %p105, %p103, %p104;
822
+ selp.b64 %rd46, -1, 0, %p105;
823
+ add.s64 %rd47, %rd56, %rd46;
824
+ $L__tmp10:
825
+ .loc 1 25 21 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:25:21
826
+ setp.lt.s32 %p106, %r465, %r26;
827
+ .loc 1 52 27 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:52:27
828
+ selp.f32 %r470, 0fFF800000, %r488, %p100;
829
+ selp.f32 %r471, 0fFF800000, %r487, %p99;
830
+ selp.f32 %r472, 0fFF800000, %r490, %p98;
831
+ selp.f32 %r473, 0fFF800000, %r489, %p97;
832
+ .loc 1 53 20 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:53:20
833
+ mov.b64 {%r474, %r475}, %rd2;
834
+ add.f32 %r476, %r474, %r473;
835
+ add.f32 %r477, %r475, %r472;
836
+ mov.b64 {%r478, %r479}, %rd1;
837
+ add.f32 %r480, %r478, %r471;
838
+ add.f32 %r481, %r479, %r470;
839
+ .loc 1 55 20 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:55:20
840
+ mul.f32 %r482, %r481, 0f3F317218;
841
+ mul.f32 %r483, %r480, 0f3F317218;
842
+ mul.f32 %r484, %r477, 0f3F317218;
843
+ mul.f32 %r485, %r476, 0f3F317218;
844
+ .loc 1 56 49 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:56:49
845
+ setp.lt.s64 %p107, %rd13, 2;
846
+ .loc 1 56 75 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:56:75
847
+ setp.gt.s64 %p108, %rd13, 1;
848
+ .loc 1 56 66 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:56:66
849
+ selp.b64 %rd48, %rd13, 0, %p108;
850
+ .loc 1 56 0 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:56
851
+ selp.b64 %rd49, 1, 0, %p107;
852
+ .loc 1 56 57 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:56:57
853
+ add.s64 %rd50, %rd48, %rd49;
854
+ .loc 1 56 34 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:56:34
855
+ mul.lo.s64 %rd51, %rd47, %rd50;
856
+ .loc 1 56 25 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:56:25
857
+ shl.b64 %rd52, %rd45, 2;
858
+ add.s64 %rd53, %rd12, %rd52;
859
+ shl.b64 %rd54, %rd51, 2;
860
+ add.s64 %rd40, %rd53, %rd54;
861
+ .loc 1 56 89 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:56:89
862
+ bar.sync 0;
863
+ st.shared.v4.b32 [%r3], {%r483, %r482, %r485, %r484};
864
+ bar.sync 0;
865
+ // begin inline asm
866
+ ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%r462}, [%r461];
867
+ // end inline asm
868
+ and.b32 %r486, %r2, 128;
869
+ setp.eq.b32 %p109, %r486, 0;
870
+ and.pred %p94, %p109, %p106;
871
+ // begin inline asm
872
+ @%p94 st.global.b32 [ %rd40 + 0 ], { %r462 };
873
+ // end inline asm
874
+ .loc 1 57 25 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:57:25
875
+ shl.b64 %rd55, %rd5, 2;
876
+ add.s64 %rd41, %rd10, %rd55;
877
+ .loc 1 57 36 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:57:36
878
+ // begin inline asm
879
+ @%p94 st.global.b32 [ %rd41 + 0 ], { %r463 };
880
+ // end inline asm
881
+ .loc 1 58 25 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:58:25
882
+ add.s64 %rd42, %rd11, %rd55;
883
+ .loc 1 58 37 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:58:37
884
+ // begin inline asm
885
+ @%p94 st.global.b32 [ %rd42 + 0 ], { %r464 };
886
+ // end inline asm
887
+ .loc 1 58 4 // c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py:58:4
888
+ ret;
889
+ $L__tmp11:
890
+ $L__func_end0:
891
+ // -- End function
892
+ }
893
+ .file 1 "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py"
894
+ .file 2 "/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py"
895
+ .file 3 "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py"
896
+ .section .debug_abbrev
897
+ {
898
+ .b8 1 // Abbreviation Code
899
+ .b8 17 // DW_TAG_compile_unit
900
+ .b8 1 // DW_CHILDREN_yes
901
+ .b8 37 // DW_AT_producer
902
+ .b8 8 // DW_FORM_string
903
+ .b8 19 // DW_AT_language
904
+ .b8 5 // DW_FORM_data2
905
+ .b8 3 // DW_AT_name
906
+ .b8 8 // DW_FORM_string
907
+ .b8 16 // DW_AT_stmt_list
908
+ .b8 6 // DW_FORM_data4
909
+ .b8 27 // DW_AT_comp_dir
910
+ .b8 8 // DW_FORM_string
911
+ .b8 0 // EOM(1)
912
+ .b8 0 // EOM(2)
913
+ .b8 2 // Abbreviation Code
914
+ .b8 46 // DW_TAG_subprogram
915
+ .b8 0 // DW_CHILDREN_no
916
+ .b8 3 // DW_AT_name
917
+ .b8 8 // DW_FORM_string
918
+ .b8 32 // DW_AT_inline
919
+ .b8 11 // DW_FORM_data1
920
+ .b8 0 // EOM(1)
921
+ .b8 0 // EOM(2)
922
+ .b8 3 // Abbreviation Code
923
+ .b8 46 // DW_TAG_subprogram
924
+ .b8 1 // DW_CHILDREN_yes
925
+ .b8 17 // DW_AT_low_pc
926
+ .b8 1 // DW_FORM_addr
927
+ .b8 18 // DW_AT_high_pc
928
+ .b8 1 // DW_FORM_addr
929
+ .b8 49 // DW_AT_abstract_origin
930
+ .b8 19 // DW_FORM_ref4
931
+ .b8 0 // EOM(1)
932
+ .b8 0 // EOM(2)
933
+ .b8 4 // Abbreviation Code
934
+ .b8 29 // DW_TAG_inlined_subroutine
935
+ .b8 0 // DW_CHILDREN_no
936
+ .b8 49 // DW_AT_abstract_origin
937
+ .b8 19 // DW_FORM_ref4
938
+ .b8 17 // DW_AT_low_pc
939
+ .b8 1 // DW_FORM_addr
940
+ .b8 18 // DW_AT_high_pc
941
+ .b8 1 // DW_FORM_addr
942
+ .b8 88 // DW_AT_call_file
943
+ .b8 11 // DW_FORM_data1
944
+ .b8 89 // DW_AT_call_line
945
+ .b8 11 // DW_FORM_data1
946
+ .b8 87 // DW_AT_call_column
947
+ .b8 11 // DW_FORM_data1
948
+ .b8 0 // EOM(1)
949
+ .b8 0 // EOM(2)
950
+ .b8 0 // EOM(3)
951
+ }
952
+ .section .debug_info
953
+ {
954
+ .b32 259 // Length of Unit
955
+ .b8 2 // DWARF version number
956
+ .b8 0
957
+ .b32 .debug_abbrev // Offset Into Abbrev. Section
958
+ .b8 8 // Address Size (in bytes)
959
+ .b8 1 // Abbrev [1] 0xb:0xfc DW_TAG_compile_unit
960
+ .b8 116 // DW_AT_producer
961
+ .b8 114
962
+ .b8 105
963
+ .b8 116
964
+ .b8 111
965
+ .b8 110
966
+ .b8 0
967
+ .b8 2 // DW_AT_language
968
+ .b8 0
969
+ .b8 99 // DW_AT_name
970
+ .b8 54
971
+ .b8 103
972
+ .b8 98
973
+ .b8 53
974
+ .b8 50
975
+ .b8 115
976
+ .b8 107
977
+ .b8 118
978
+ .b8 113
979
+ .b8 115
980
+ .b8 55
981
+ .b8 111
982
+ .b8 114
983
+ .b8 53
984
+ .b8 55
985
+ .b8 118
986
+ .b8 100
987
+ .b8 51
988
+ .b8 122
989
+ .b8 117
990
+ .b8 53
991
+ .b8 117
992
+ .b8 109
993
+ .b8 51
994
+ .b8 114
995
+ .b8 53
996
+ .b8 114
997
+ .b8 110
998
+ .b8 109
999
+ .b8 101
1000
+ .b8 105
1001
+ .b8 109
1002
+ .b8 100
1003
+ .b8 53
1004
+ .b8 113
1005
+ .b8 97
1006
+ .b8 109
1007
+ .b8 50
1008
+ .b8 55
1009
+ .b8 108
1010
+ .b8 53
1011
+ .b8 106
1012
+ .b8 55
1013
+ .b8 117
1014
+ .b8 113
1015
+ .b8 120
1016
+ .b8 55
1017
+ .b8 116
1018
+ .b8 52
1019
+ .b8 97
1020
+ .b8 105
1021
+ .b8 46
1022
+ .b8 112
1023
+ .b8 121
1024
+ .b8 0
1025
+ .b32 .debug_line // DW_AT_stmt_list
1026
+ .b8 47 // DW_AT_comp_dir
1027
+ .b8 119
1028
+ .b8 111
1029
+ .b8 114
1030
+ .b8 107
1031
+ .b8 115
1032
+ .b8 112
1033
+ .b8 97
1034
+ .b8 99
1035
+ .b8 101
1036
+ .b8 47
1037
+ .b8 104
1038
+ .b8 97
1039
+ .b8 110
1040
+ .b8 114
1041
+ .b8 117
1042
+ .b8 105
1043
+ .b8 47
1044
+ .b8 106
1045
+ .b8 117
1046
+ .b8 110
1047
+ .b8 113
1048
+ .b8 117
1049
+ .b8 97
1050
+ .b8 110
1051
+ .b8 47
1052
+ .b8 83
1053
+ .b8 112
1054
+ .b8 101
1055
+ .b8 99
1056
+ .b8 70
1057
+ .b8 111
1058
+ .b8 114
1059
+ .b8 103
1060
+ .b8 101
1061
+ .b8 47
1062
+ .b8 99
1063
+ .b8 97
1064
+ .b8 99
1065
+ .b8 104
1066
+ .b8 101
1067
+ .b8 47
1068
+ .b8 99
1069
+ .b8 111
1070
+ .b8 109
1071
+ .b8 112
1072
+ .b8 105
1073
+ .b8 108
1074
+ .b8 101
1075
+ .b8 100
1076
+ .b8 95
1077
+ .b8 107
1078
+ .b8 101
1079
+ .b8 114
1080
+ .b8 110
1081
+ .b8 101
1082
+ .b8 108
1083
+ .b8 115
1084
+ .b8 47
1085
+ .b8 54
1086
+ .b8 103
1087
+ .b8 0
1088
+ .b8 2 // Abbrev [2] 0x8f:0x19 DW_TAG_subprogram
1089
+ .b8 116 // DW_AT_name
1090
+ .b8 114
1091
+ .b8 105
1092
+ .b8 116
1093
+ .b8 111
1094
+ .b8 110
1095
+ .b8 95
1096
+ .b8 112
1097
+ .b8 101
1098
+ .b8 114
1099
+ .b8 95
1100
+ .b8 102
1101
+ .b8 117
1102
+ .b8 115
1103
+ .b8 101
1104
+ .b8 100
1105
+ .b8 95
1106
+ .b8 109
1107
+ .b8 117
1108
+ .b8 108
1109
+ .b8 95
1110
+ .b8 49
1111
+ .b8 0
1112
+ .b8 1 // DW_AT_inline
1113
+ .b8 3 // Abbrev [3] 0xa8:0x5e DW_TAG_subprogram
1114
+ .b64 $L__func_begin0 // DW_AT_low_pc
1115
+ .b64 $L__func_end0 // DW_AT_high_pc
1116
+ .b32 143 // DW_AT_abstract_origin
1117
+ .b8 4 // Abbrev [4] 0xbd:0x18 DW_TAG_inlined_subroutine
1118
+ .b32 143 // DW_AT_abstract_origin
1119
+ .b64 $L__tmp1 // DW_AT_low_pc
1120
+ .b64 $L__tmp2 // DW_AT_high_pc
1121
+ .b8 1 // DW_AT_call_file
1122
+ .b8 39 // DW_AT_call_line
1123
+ .b8 37 // DW_AT_call_column
1124
+ .b8 4 // Abbrev [4] 0xd5:0x18 DW_TAG_inlined_subroutine
1125
+ .b32 143 // DW_AT_abstract_origin
1126
+ .b64 $L__tmp3 // DW_AT_low_pc
1127
+ .b64 $L__tmp4 // DW_AT_high_pc
1128
+ .b8 1 // DW_AT_call_file
1129
+ .b8 49 // DW_AT_call_line
1130
+ .b8 26 // DW_AT_call_column
1131
+ .b8 4 // Abbrev [4] 0xed:0x18 DW_TAG_inlined_subroutine
1132
+ .b32 143 // DW_AT_abstract_origin
1133
+ .b64 $L__tmp5 // DW_AT_low_pc
1134
+ .b64 $L__tmp10 // DW_AT_high_pc
1135
+ .b8 1 // DW_AT_call_file
1136
+ .b8 34 // DW_AT_call_line
1137
+ .b8 51 // DW_AT_call_column
1138
+ .b8 0 // End Of Children Mark
1139
+ .b8 0 // End Of Children Mark
1140
+ }
1141
+ .section .debug_macinfo { }
progress/SpecForge/cache/compiled_kernels/triton/3/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.source ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":18:0)
2
+ #loc57 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":69:0)
3
+ #loc69 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":122:0)
4
+ #loc71 = loc(unknown)
5
+ #loc74 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":109:0)
6
+ #loc83 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0)
7
+ #loc87 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0)
8
+ #loc96 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0)
9
+ #loc100 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0)
10
+ #loc104 = loc("in_ptr0"(#loc))
11
+ #loc105 = loc("in_ptr1"(#loc))
12
+ #loc106 = loc("out_ptr0"(#loc))
13
+ #loc107 = loc("out_ptr1"(#loc))
14
+ #loc108 = loc("out_ptr2"(#loc))
15
+ #loc109 = loc("ks0"(#loc))
16
+ #loc110 = loc("xnumel"(#loc))
17
+ #loc111 = loc("r0_numel"(#loc))
18
+ #loc154 = loc("a"(#loc57))
19
+ #loc155 = loc("b"(#loc57))
20
+ #loc161 = loc("a"(#loc69))
21
+ #loc162 = loc("a"(#loc74))
22
+ #loc163 = loc("b"(#loc74))
23
+ #loc167 = loc("x"(#loc83))
24
+ #loc168 = loc("x"(#loc87))
25
+ #loc169 = loc("input"(#loc96))
26
+ #loc170 = loc("a"(#loc100))
27
+ #loc171 = loc("b"(#loc100))
28
+ module {
29
+ tt.func public @triton_per_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
30
+ %r0_numel_0 = arith.constant 32 : i32 loc(#loc112)
31
+ %xoffset = tt.get_program_id x : i32 loc(#loc113)
32
+ %xoffset_1 = arith.constant 128 : i32 loc(#loc114)
33
+ %xoffset_2 = arith.constant 128 : i32 loc(#loc114)
34
+ %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc114)
35
+ %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc115)
36
+ %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc116)
37
+ %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<128x1xi32> loc(#loc117)
38
+ %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<128x1xi32> loc(#loc117)
39
+ %xmask = tt.splat %xnumel : i32 -> tensor<128x1xi32> loc(#loc118)
40
+ %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<128x1xi32> loc(#loc118)
41
+ %r0_index = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc119)
42
+ %r0_index_8 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc120)
43
+ %r0_offset = arith.constant 0 : i32 loc(#loc121)
44
+ %r0_mask = arith.constant true loc(#loc122)
45
+ %r0_mask_9 = arith.constant dense<true> : tensor<128x32xi1> loc(#loc122)
46
+ %x2 = arith.extsi %xindex_6 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc123)
47
+ %x2_10 = tt.splat %ks0 : i64 -> tensor<128x1xi64> loc(#loc123)
48
+ %x2_11 = arith.remsi %x2, %x2_10 : tensor<128x1xi64> loc(#loc123)
49
+ %x3 = tt.call @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S128_1S_i64__(%xindex_6, %ks0) : (tensor<128x1xi32>, i64) -> tensor<128x1xi64> loc(#loc124)
50
+ %tmp0 = arith.constant 32 : i32 loc(#loc125)
51
+ %tmp0_12 = arith.constant 32 : i64 loc(#loc125)
52
+ %tmp0_13 = arith.muli %tmp0_12, %ks0 : i64 loc(#loc125)
53
+ %tmp0_14 = arith.extsi %r0_index_8 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc126)
54
+ %tmp0_15 = tt.splat %tmp0_13 : i64 -> tensor<1x32xi64> loc(#loc126)
55
+ %tmp0_16 = arith.muli %tmp0_15, %tmp0_14 : tensor<1x32xi64> loc(#loc126)
56
+ %tmp0_17 = arith.extsi %xindex_6 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc127)
57
+ %tmp0_18 = tt.broadcast %tmp0_17 : tensor<128x1xi64> -> tensor<128x32xi64> loc(#loc127)
58
+ %tmp0_19 = tt.broadcast %tmp0_16 : tensor<1x32xi64> -> tensor<128x32xi64> loc(#loc127)
59
+ %tmp0_20 = arith.addi %tmp0_18, %tmp0_19 : tensor<128x32xi64> loc(#loc127)
60
+ %tmp0_21 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<128x32x!tt.ptr<f32>> loc(#loc128)
61
+ %tmp0_22 = tt.addptr %tmp0_21, %tmp0_20 : tensor<128x32x!tt.ptr<f32>>, tensor<128x32xi64> loc(#loc128)
62
+ %tmp0_23 = arith.constant 0.000000e+00 : f32 loc(#loc129)
63
+ %tmp0_24 = tt.broadcast %xmask_7 : tensor<128x1xi1> -> tensor<128x32xi1> loc(#loc129)
64
+ %tmp0_25 = arith.constant dense<0.000000e+00> : tensor<128x32xf32> loc(#loc129)
65
+ %tmp0_26 = tt.load %tmp0_22, %tmp0_24, %tmp0_25 : tensor<128x32x!tt.ptr<f32>> loc(#loc129)
66
+ %tmp5 = arith.constant 32 : i32 loc(#loc130)
67
+ %tmp5_27 = arith.constant 32 : i64 loc(#loc130)
68
+ %tmp5_28 = arith.muli %tmp5_27, %ks0 : i64 loc(#loc130)
69
+ %tmp5_29 = arith.extsi %r0_index_8 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc131)
70
+ %tmp5_30 = tt.splat %tmp5_28 : i64 -> tensor<1x32xi64> loc(#loc131)
71
+ %tmp5_31 = arith.muli %tmp5_30, %tmp5_29 : tensor<1x32xi64> loc(#loc131)
72
+ %tmp5_32 = arith.extsi %xindex_6 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc132)
73
+ %tmp5_33 = tt.broadcast %tmp5_32 : tensor<128x1xi64> -> tensor<128x32xi64> loc(#loc132)
74
+ %tmp5_34 = tt.broadcast %tmp5_31 : tensor<1x32xi64> -> tensor<128x32xi64> loc(#loc132)
75
+ %tmp5_35 = arith.addi %tmp5_33, %tmp5_34 : tensor<128x32xi64> loc(#loc132)
76
+ %tmp5_36 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<128x32x!tt.ptr<f32>> loc(#loc133)
77
+ %tmp5_37 = tt.addptr %tmp5_36, %tmp5_35 : tensor<128x32x!tt.ptr<f32>>, tensor<128x32xi64> loc(#loc133)
78
+ %tmp5_38 = arith.constant 0.000000e+00 : f32 loc(#loc134)
79
+ %tmp5_39 = tt.broadcast %xmask_7 : tensor<128x1xi1> -> tensor<128x32xi1> loc(#loc134)
80
+ %tmp5_40 = arith.constant dense<0.000000e+00> : tensor<128x32xf32> loc(#loc134)
81
+ %tmp5_41 = tt.load %tmp5_37, %tmp5_39, %tmp5_40 : tensor<128x32x!tt.ptr<f32>> loc(#loc134)
82
+ %tmp3 = arith.constant 0xFF800000 : f32 loc(#loc135)
83
+ %tmp3_42 = arith.constant 0xFF800000 : f32 loc(#loc135)
84
+ %tmp3_43 = arith.constant dense<0xFF800000> : tensor<128x32xf32> loc(#loc135)
85
+ %tmp3_44 = tt.broadcast %xmask_7 : tensor<128x1xi1> -> tensor<128x32xi1> loc(#loc135)
86
+ %tmp3_45 = arith.select %tmp3_44, %tmp0_26, %tmp3_43 : tensor<128x32xi1>, tensor<128x32xf32> loc(#loc135)
87
+ %tmp4 = tt.call @"torch._inductor.runtime.triton_helpers.max2__fp32S128_32S__(1,)cconstexpr_1_"(%tmp3_45) : (tensor<128x32xf32>) -> tensor<128xf32> loc(#loc136)
88
+ %tmp4_46 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc137)
89
+ %tmp6 = arith.constant 0xFF800000 : f32 loc(#loc138)
90
+ %tmp7 = arith.constant dense<0xFF800000> : tensor<128x1xf32> loc(#loc139)
91
+ %tmp7_47 = arith.cmpf oeq, %tmp4_46, %tmp7 : tensor<128x1xf32> loc(#loc139)
92
+ %tmp8 = tt.broadcast %tmp4_46 : tensor<128x1xf32> -> tensor<128x32xf32> loc(#loc140)
93
+ %tmp8_48 = arith.subf %tmp0_26, %tmp8 : tensor<128x32xf32> loc(#loc140)
94
+ %tmp9 = arith.constant 0.000000e+00 : f32 loc(#loc141)
95
+ %tmp10 = arith.constant dense<0.000000e+00> : tensor<128x32xf32> loc(#loc142)
96
+ %tmp10_49 = tt.broadcast %tmp7_47 : tensor<128x1xi1> -> tensor<128x32xi1> loc(#loc142)
97
+ %tmp10_50 = arith.select %tmp10_49, %tmp10, %tmp8_48 : tensor<128x32xi1>, tensor<128x32xf32> loc(#loc142)
98
+ %tmp11 = tt.extern_elementwise %tmp10_50 {libname = "", libpath = "", pure = true, symbol = "__nv_exp2f"} : (tensor<128x32xf32>) -> tensor<128x32xf32> loc(#loc143)
99
+ %tmp12 = arith.mulf %tmp5_41, %tmp11 : tensor<128x32xf32> loc(#loc144)
100
+ %tmp15 = arith.constant 0 : i32 loc(#loc145)
101
+ %tmp15_51 = arith.constant 0.000000e+00 : f32 loc(#loc145)
102
+ %tmp15_52 = arith.constant dense<0.000000e+00> : tensor<128x32xf32> loc(#loc145)
103
+ %tmp15_53 = tt.broadcast %xmask_7 : tensor<128x1xi1> -> tensor<128x32xi1> loc(#loc145)
104
+ %tmp15_54 = arith.select %tmp15_53, %tmp12, %tmp15_52 : tensor<128x32xi1>, tensor<128x32xf32> loc(#loc145)
105
+ %tmp16 = tt.call @"triton.language.standard.sum__fp32S128_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp15_54) : (tensor<128x32xf32>) -> tensor<128xf32> loc(#loc146)
106
+ %tmp16_55 = tt.expand_dims %tmp16 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc147)
107
+ %tmp17 = arith.constant 1.000000e+00 : f32 loc(#loc148)
108
+ %tmp18 = arith.constant dense<1.000000e+00> : tensor<128x1xf32> loc(#loc149)
109
+ %tmp18_56 = arith.select %tmp7_47, %tmp18, %tmp16_55 : tensor<128x1xi1>, tensor<128x1xf32> loc(#loc149)
110
+ %tmp19 = tt.extern_elementwise %tmp18_56 {libname = "", libpath = "", pure = true, symbol = "__nv_log2f"} : (tensor<128x1xf32>) -> tensor<128x1xf32> loc(#loc150)
111
+ %tmp20 = arith.addf %tmp19, %tmp4_46 : tensor<128x1xf32> loc(#loc151)
112
+ %tmp21 = arith.constant 0.693147182 : f32 loc(#loc152)
113
+ %tmp22 = arith.constant dense<0.693147182> : tensor<128x1xf32> loc(#loc153)
114
+ %tmp22_57 = arith.mulf %tmp20, %tmp22 : tensor<128x1xf32> loc(#loc153)
115
+ %c1_i32 = arith.constant 1 : i32 loc(#loc43)
116
+ %0 = arith.extsi %c1_i32 : i32 to i64 loc(#loc43)
117
+ %1 = arith.cmpi sge, %0, %ks0 : i64 loc(#loc43)
118
+ %c1_i32_58 = arith.constant 1 : i32 loc(#loc44)
119
+ %c1_i32_59 = arith.constant 1 : i32 loc(#loc44)
120
+ %2 = arith.extui %1 : i1 to i32 loc(#loc44)
121
+ %3 = arith.muli %c1_i32_59, %2 : i32 loc(#loc44)
122
+ %c1_i32_60 = arith.constant 1 : i32 loc(#loc45)
123
+ %4 = arith.extsi %c1_i32_60 : i32 to i64 loc(#loc45)
124
+ %5 = arith.cmpi sgt, %ks0, %4 : i64 loc(#loc45)
125
+ %6 = arith.extui %5 : i1 to i64 loc(#loc46)
126
+ %7 = arith.muli %ks0, %6 : i64 loc(#loc46)
127
+ %8 = arith.extsi %3 : i32 to i64 loc(#loc47)
128
+ %9 = arith.addi %8, %7 : i64 loc(#loc47)
129
+ %10 = tt.splat %9 : i64 -> tensor<128x1xi64> loc(#loc48)
130
+ %11 = arith.muli %x3, %10 : tensor<128x1xi64> loc(#loc48)
131
+ %12 = arith.addi %x2_11, %11 : tensor<128x1xi64> loc(#loc49)
132
+ %13 = tt.splat %out_ptr2 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>> loc(#loc50)
133
+ %14 = tt.addptr %13, %12 : tensor<128x1x!tt.ptr<f32>>, tensor<128x1xi64> loc(#loc50)
134
+ tt.store %14, %tmp22_57, %xmask_7 : tensor<128x1x!tt.ptr<f32>> loc(#loc51)
135
+ %15 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>> loc(#loc52)
136
+ %16 = tt.addptr %15, %xindex_6 : tensor<128x1x!tt.ptr<f32>>, tensor<128x1xi32> loc(#loc52)
137
+ tt.store %16, %tmp4_46, %xmask_7 : tensor<128x1x!tt.ptr<f32>> loc(#loc53)
138
+ %17 = tt.splat %out_ptr1 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>> loc(#loc54)
139
+ %18 = tt.addptr %17, %xindex_6 : tensor<128x1x!tt.ptr<f32>>, tensor<128x1xi32> loc(#loc54)
140
+ tt.store %18, %tmp16_55, %xmask_7 : tensor<128x1x!tt.ptr<f32>> loc(#loc55)
141
+ tt.return loc(#loc56)
142
+ } loc(#loc)
143
+ tt.func private @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S128_1S_i64__(%a: tensor<128x1xi32> loc("a"(#loc57)), %b: i64 loc("b"(#loc57))) -> tensor<128x1xi64> attributes {noinline = false} {
144
+ %quot = arith.extsi %a : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc156)
145
+ %quot_0 = tt.splat %b : i64 -> tensor<128x1xi64> loc(#loc156)
146
+ %quot_1 = arith.divsi %quot, %quot_0 : tensor<128x1xi64> loc(#loc156)
147
+ %remainder = arith.extsi %a : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc157)
148
+ %remainder_2 = tt.splat %b : i64 -> tensor<128x1xi64> loc(#loc157)
149
+ %remainder_3 = arith.remsi %remainder, %remainder_2 : tensor<128x1xi64> loc(#loc157)
150
+ %fixed = arith.constant 0 : i32 loc(#loc158)
151
+ %fixed_4 = arith.extsi %fixed : i32 to i64 loc(#loc158)
152
+ %fixed_5 = tt.splat %fixed_4 : i64 -> tensor<128x1xi64> loc(#loc158)
153
+ %fixed_6 = arith.cmpi ne, %remainder_3, %fixed_5 : tensor<128x1xi64> loc(#loc158)
154
+ %fixed_7 = arith.constant 1 : i32 loc(#loc159)
155
+ %fixed_8 = arith.constant 1 : i64 loc(#loc159)
156
+ %fixed_9 = arith.constant dense<1> : tensor<128x1xi64> loc(#loc159)
157
+ %fixed_10 = arith.subi %quot_1, %fixed_9 : tensor<128x1xi64> loc(#loc159)
158
+ %fixed_11 = arith.select %fixed_6, %fixed_10, %quot_1 : tensor<128x1xi1>, tensor<128x1xi64> loc(#loc160)
159
+ %c0_i32 = arith.constant 0 : i32 loc(#loc63)
160
+ %cst = arith.constant dense<0> : tensor<128x1xi32> loc(#loc63)
161
+ %0 = arith.cmpi slt, %a, %cst : tensor<128x1xi32> loc(#loc63)
162
+ %c0_i32_12 = arith.constant 0 : i32 loc(#loc64)
163
+ %1 = arith.extsi %c0_i32_12 : i32 to i64 loc(#loc64)
164
+ %2 = arith.cmpi slt, %b, %1 : i64 loc(#loc64)
165
+ %3 = tt.splat %2 : i1 -> tensor<128x1xi1> loc(#loc65)
166
+ %4 = arith.cmpi ne, %0, %3 : tensor<128x1xi1> loc(#loc65)
167
+ %5 = arith.select %4, %fixed_11, %quot_1 : tensor<128x1xi1>, tensor<128x1xi64> loc(#loc66)
168
+ tt.return %5 : tensor<128x1xi64> loc(#loc67)
169
+ ^bb1: // no predecessors
170
+ %6 = ub.poison : tensor<128x1xi64> loc(#loc68)
171
+ tt.return %6 : tensor<128x1xi64> loc(#loc68)
172
+ } loc(#loc57)
173
+ tt.func private @"torch._inductor.runtime.triton_helpers.max2__fp32S128_32S__(1,)cconstexpr_1_"(%a: tensor<128x32xf32> loc("a"(#loc69))) -> tensor<128xf32> attributes {noinline = false} {
174
+ %0 = "tt.reduce"(%a) <{axis = 1 : i32}> ({
175
+ ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
176
+ %2 = tt.call @torch._inductor.runtime.triton_helpers.maximum__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc70)
177
+ tt.reduce.return %2 : f32 loc(#loc70)
178
+ }) : (tensor<128x32xf32>) -> tensor<128xf32> loc(#loc70)
179
+ tt.return %0 : tensor<128xf32> loc(#loc72)
180
+ ^bb1: // no predecessors
181
+ %1 = ub.poison : tensor<128xf32> loc(#loc73)
182
+ tt.return %1 : tensor<128xf32> loc(#loc73)
183
+ } loc(#loc69)
184
+ tt.func private @torch._inductor.runtime.triton_helpers.maximum__fp32_fp32__(%a: f32 loc("a"(#loc74)), %b: f32 loc("b"(#loc74))) -> f32 attributes {noinline = false} {
185
+ %mask = arith.cmpf ogt, %a, %b : f32 loc(#loc172)
186
+ %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a) : (f32) -> i1 loc(#loc76)
187
+ %1 = scf.if %0 -> (i1) {
188
+ %mask_0 = arith.cmpf une, %a, %a : f32 loc(#loc165)
189
+ %mask_1 = arith.ori %mask, %mask_0 : i1 loc(#loc173)
190
+ scf.yield %mask_1 : i1 loc(#loc173)
191
+ } else {
192
+ scf.yield %mask : i1 loc(#loc71)
193
+ } loc(#loc77)
194
+ %2 = arith.select %1, %a, %b : f32 loc(#loc80)
195
+ tt.return %2 : f32 loc(#loc81)
196
+ ^bb1: // no predecessors
197
+ %3 = ub.poison : f32 loc(#loc82)
198
+ tt.return %3 : f32 loc(#loc82)
199
+ } loc(#loc74)
200
+ tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc83))) -> i1 attributes {noinline = false} {
201
+ %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc84)
202
+ %true = arith.constant true loc(#loc85)
203
+ tt.return %true : i1 loc(#loc85)
204
+ ^bb1: // no predecessors
205
+ %1 = ub.poison : i1 loc(#loc86)
206
+ tt.return %1 : i1 loc(#loc86)
207
+ } loc(#loc83)
208
+ tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc87))) -> tensor<1xf32> attributes {noinline = false} {
209
+ %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc88)
210
+ %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc89)
211
+ %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc89)
212
+ %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc89)
213
+ tt.return %3 : tensor<1xf32> loc(#loc90)
214
+ ^bb1: // no predecessors
215
+ %4 = ub.poison : tensor<1xf32> loc(#loc91)
216
+ tt.return %4 : tensor<1xf32> loc(#loc91)
217
+ } loc(#loc87)
218
+ tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} {
219
+ %false = arith.constant false loc(#loc93)
220
+ %cst = arith.constant dense<false> : tensor<1xi1> loc(#loc93)
221
+ tt.return %cst : tensor<1xi1> loc(#loc94)
222
+ ^bb1: // no predecessors
223
+ %0 = ub.poison : tensor<1xi1> loc(#loc95)
224
+ tt.return %0 : tensor<1xi1> loc(#loc95)
225
+ } loc(#loc92)
226
+ tt.func private @"triton.language.standard.sum__fp32S128_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x32xf32> loc("input"(#loc96))) -> tensor<128xf32> attributes {noinline = false} {
227
+ %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
228
+ ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
229
+ %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc97)
230
+ tt.reduce.return %2 : f32 loc(#loc97)
231
+ }) : (tensor<128x32xf32>) -> tensor<128xf32> loc(#loc97)
232
+ tt.return %0 : tensor<128xf32> loc(#loc98)
233
+ ^bb1: // no predecessors
234
+ %1 = ub.poison : tensor<128xf32> loc(#loc99)
235
+ tt.return %1 : tensor<128xf32> loc(#loc99)
236
+ } loc(#loc96)
237
+ tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc100)), %b: f32 loc("b"(#loc100))) -> f32 attributes {noinline = false} {
238
+ %0 = arith.addf %a, %b : f32 loc(#loc101)
239
+ tt.return %0 : f32 loc(#loc102)
240
+ ^bb1: // no predecessors
241
+ %1 = ub.poison : f32 loc(#loc103)
242
+ tt.return %1 : f32 loc(#loc103)
243
+ } loc(#loc100)
244
+ } loc(#loc)
245
+ #loc1 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":19:15)
246
+ #loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":23:28)
247
+ #loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":23:33)
248
+ #loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":24:36)
249
+ #loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":24:44)
250
+ #loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":24:23)
251
+ #loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":25:21)
252
+ #loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":26:28)
253
+ #loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":26:38)
254
+ #loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":27:16)
255
+ #loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":28:48)
256
+ #loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":33:19)
257
+ #loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":34:51)
258
+ #loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":35:38)
259
+ #loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":35:42)
260
+ #loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":35:35)
261
+ #loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":35:30)
262
+ #loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":35:49)
263
+ #loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":36:38)
264
+ #loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":36:42)
265
+ #loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":36:35)
266
+ #loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":36:30)
267
+ #loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":36:49)
268
+ #loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":38:33)
269
+ #loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":39:37)
270
+ #loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":39:40)
271
+ #loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":40:11)
272
+ #loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":41:19)
273
+ #loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":42:18)
274
+ #loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":43:11)
275
+ #loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":44:33)
276
+ #loc32 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":45:27)
277
+ #loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":46:19)
278
+ #loc34 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":48:35)
279
+ #loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":49:26)
280
+ #loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":49:29)
281
+ #loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":50:12)
282
+ #loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":51:34)
283
+ #loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":52:27)
284
+ #loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":53:20)
285
+ #loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":54:12)
286
+ #loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":55:20)
287
+ #loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:49)
288
+ #loc44 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:41)
289
+ #loc45 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:75)
290
+ #loc46 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:66)
291
+ #loc47 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:57)
292
+ #loc48 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:34)
293
+ #loc49 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:30)
294
+ #loc50 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:25)
295
+ #loc51 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:89)
296
+ #loc52 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":57:25)
297
+ #loc53 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":57:36)
298
+ #loc54 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":58:25)
299
+ #loc55 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":58:37)
300
+ #loc56 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":58:4)
301
+ #loc58 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
302
+ #loc59 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20)
303
+ #loc60 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
304
+ #loc61 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
305
+ #loc62 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
306
+ #loc63 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
307
+ #loc64 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36)
308
+ #loc65 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
309
+ #loc66 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
310
+ #loc67 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:11)
311
+ #loc68 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:4)
312
+ #loc70 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29)
313
+ #loc72 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:11)
314
+ #loc73 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:4)
315
+ #loc75 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15)
316
+ #loc76 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:19)
317
+ #loc77 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:7)
318
+ #loc78 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21)
319
+ #loc79 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16)
320
+ #loc80 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29)
321
+ #loc81 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:11)
322
+ #loc82 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:4)
323
+ #loc84 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29)
324
+ #loc85 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11)
325
+ #loc86 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4)
326
+ #loc88 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30)
327
+ #loc89 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15)
328
+ #loc90 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11)
329
+ #loc91 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4)
330
+ #loc92 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0)
331
+ #loc93 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31)
332
+ #loc94 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11)
333
+ #loc95 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4)
334
+ #loc97 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
335
+ #loc98 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11)
336
+ #loc99 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4)
337
+ #loc101 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
338
+ #loc102 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11)
339
+ #loc103 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4)
340
+ #loc112 = loc("r0_numel"(#loc1))
341
+ #loc113 = loc("xoffset"(#loc2))
342
+ #loc114 = loc("xoffset"(#loc3))
343
+ #loc115 = loc("xindex"(#loc4))
344
+ #loc116 = loc("xindex"(#loc5))
345
+ #loc117 = loc("xindex"(#loc6))
346
+ #loc118 = loc("xmask"(#loc7))
347
+ #loc119 = loc("r0_index"(#loc8))
348
+ #loc120 = loc("r0_index"(#loc9))
349
+ #loc121 = loc("r0_offset"(#loc10))
350
+ #loc122 = loc("r0_mask"(#loc11))
351
+ #loc123 = loc("x2"(#loc12))
352
+ #loc124 = loc("x3"(#loc13))
353
+ #loc125 = loc("tmp0"(#loc14))
354
+ #loc126 = loc("tmp0"(#loc15))
355
+ #loc127 = loc("tmp0"(#loc16))
356
+ #loc128 = loc("tmp0"(#loc17))
357
+ #loc129 = loc("tmp0"(#loc18))
358
+ #loc130 = loc("tmp5"(#loc19))
359
+ #loc131 = loc("tmp5"(#loc20))
360
+ #loc132 = loc("tmp5"(#loc21))
361
+ #loc133 = loc("tmp5"(#loc22))
362
+ #loc134 = loc("tmp5"(#loc23))
363
+ #loc135 = loc("tmp3"(#loc24))
364
+ #loc136 = loc("tmp4"(#loc25))
365
+ #loc137 = loc("tmp4"(#loc26))
366
+ #loc138 = loc("tmp6"(#loc27))
367
+ #loc139 = loc("tmp7"(#loc28))
368
+ #loc140 = loc("tmp8"(#loc29))
369
+ #loc141 = loc("tmp9"(#loc30))
370
+ #loc142 = loc("tmp10"(#loc31))
371
+ #loc143 = loc("tmp11"(#loc32))
372
+ #loc144 = loc("tmp12"(#loc33))
373
+ #loc145 = loc("tmp15"(#loc34))
374
+ #loc146 = loc("tmp16"(#loc35))
375
+ #loc147 = loc("tmp16"(#loc36))
376
+ #loc148 = loc("tmp17"(#loc37))
377
+ #loc149 = loc("tmp18"(#loc38))
378
+ #loc150 = loc("tmp19"(#loc39))
379
+ #loc151 = loc("tmp20"(#loc40))
380
+ #loc152 = loc("tmp21"(#loc41))
381
+ #loc153 = loc("tmp22"(#loc42))
382
+ #loc156 = loc("quot"(#loc58))
383
+ #loc157 = loc("remainder"(#loc59))
384
+ #loc158 = loc("fixed"(#loc60))
385
+ #loc159 = loc("fixed"(#loc61))
386
+ #loc160 = loc("fixed"(#loc62))
387
+ #loc164 = loc("mask"(#loc75))
388
+ #loc165 = loc("mask"(#loc78))
389
+ #loc166 = loc("mask"(#loc79))
390
+ #loc172 = loc("mask"(#loc164))
391
+ #loc173 = loc("mask"(#loc166))
progress/SpecForge/cache/compiled_kernels/triton/3/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.ttgir ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 2], order = [0, 1]}>
2
+ #blocked1 = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 8], order = [0, 1]}>
3
+ #loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":18:0)
4
+ #loc1 = loc(unknown)
5
+ #loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":39:37)
6
+ #loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":49:26)
7
+ #loc61 = loc("in_ptr0"(#loc))
8
+ #loc62 = loc("in_ptr1"(#loc))
9
+ #loc63 = loc("out_ptr0"(#loc))
10
+ #loc64 = loc("out_ptr1"(#loc))
11
+ #loc65 = loc("out_ptr2"(#loc))
12
+ #loc66 = loc("ks0"(#loc))
13
+ #loc67 = loc("xnumel"(#loc))
14
+ #loc68 = loc("r0_numel"(#loc))
15
+ #loc89 = loc("tmp4"(#loc27))
16
+ #loc100 = loc("tmp16"(#loc40))
17
+ #loc116 = loc(callsite(#loc1 at #loc89))
18
+ #loc120 = loc(callsite(#loc1 at #loc100))
19
+ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
20
+ tt.func public @triton_per_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
21
+ %cst = arith.constant dense<0> : tensor<128x1xi64, #blocked> loc(#loc1)
22
+ %cst_0 = arith.constant dense<0> : tensor<128x1xi32, #blocked> loc(#loc1)
23
+ %cst_1 = arith.constant dense<1> : tensor<128x1xi64, #blocked> loc(#loc1)
24
+ %c32_i64 = arith.constant 32 : i64 loc(#loc1)
25
+ %c128_i32 = arith.constant 128 : i32 loc(#loc1)
26
+ %c0_i64 = arith.constant 0 : i64 loc(#loc1)
27
+ %c1_i64 = arith.constant 1 : i64 loc(#loc1)
28
+ %cst_2 = arith.constant dense<0.693147182> : tensor<128x1xf32, #blocked1> loc(#loc1)
29
+ %cst_3 = arith.constant dense<1.000000e+00> : tensor<128x1xf32, #blocked1> loc(#loc1)
30
+ %cst_4 = arith.constant dense<0xFF800000> : tensor<128x1xf32, #blocked1> loc(#loc1)
31
+ %cst_5 = arith.constant dense<0xFF800000> : tensor<128x32xf32, #blocked1> loc(#loc1)
32
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<128x32xf32, #blocked1> loc(#loc1)
33
+ %xoffset = tt.get_program_id x : i32 loc(#loc69)
34
+ %xoffset_7 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc70)
35
+ %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc71)
36
+ %xindex_8 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc71)
37
+ %xindex_9 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1xi32, #blocked1> loc(#loc71)
38
+ %xindex_10 = tt.expand_dims %xindex_8 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc71)
39
+ %xindex_11 = tt.splat %xoffset_7 : i32 -> tensor<128x1xi32, #blocked1> loc(#loc72)
40
+ %xindex_12 = tt.splat %xoffset_7 : i32 -> tensor<128x1xi32, #blocked> loc(#loc72)
41
+ %xindex_13 = arith.addi %xindex_11, %xindex_9 : tensor<128x1xi32, #blocked1> loc(#loc72)
42
+ %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<128x1xi32, #blocked> loc(#loc72)
43
+ %xmask = tt.splat %xnumel : i32 -> tensor<128x1xi32, #blocked1> loc(#loc73)
44
+ %xmask_15 = tt.splat %xnumel : i32 -> tensor<128x1xi32, #blocked> loc(#loc73)
45
+ %xmask_16 = arith.cmpi slt, %xindex_13, %xmask : tensor<128x1xi32, #blocked1> loc(#loc73)
46
+ %xmask_17 = arith.cmpi slt, %xindex_14, %xmask_15 : tensor<128x1xi32, #blocked> loc(#loc73)
47
+ %r0_index = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc74)
48
+ %r0_index_18 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> loc(#loc74)
49
+ %x2 = arith.extsi %xindex_13 : tensor<128x1xi32, #blocked1> to tensor<128x1xi64, #blocked1> loc(#loc75)
50
+ %x2_19 = arith.extsi %xindex_14 : tensor<128x1xi32, #blocked> to tensor<128x1xi64, #blocked> loc(#loc75)
51
+ %x2_20 = tt.splat %ks0 : i64 -> tensor<128x1xi64, #blocked> loc(#loc75)
52
+ %x2_21 = arith.remsi %x2_19, %x2_20 : tensor<128x1xi64, #blocked> loc(#loc75)
53
+ %quot = arith.divsi %x2_19, %x2_20 : tensor<128x1xi64, #blocked> loc(#loc107)
54
+ %fixed = arith.cmpi ne, %x2_21, %cst : tensor<128x1xi64, #blocked> loc(#loc108)
55
+ %fixed_22 = arith.subi %quot, %cst_1 : tensor<128x1xi64, #blocked> loc(#loc109)
56
+ %fixed_23 = arith.select %fixed, %fixed_22, %quot : tensor<128x1xi1, #blocked>, tensor<128x1xi64, #blocked> loc(#loc110)
57
+ %x3 = arith.cmpi slt, %xindex_14, %cst_0 : tensor<128x1xi32, #blocked> loc(#loc111)
58
+ %x3_24 = arith.cmpi slt, %ks0, %c0_i64 : i64 loc(#loc112)
59
+ %x3_25 = tt.splat %x3_24 : i1 -> tensor<128x1xi1, #blocked> loc(#loc113)
60
+ %x3_26 = arith.cmpi ne, %x3, %x3_25 : tensor<128x1xi1, #blocked> loc(#loc113)
61
+ %x3_27 = arith.select %x3_26, %fixed_23, %quot : tensor<128x1xi1, #blocked>, tensor<128x1xi64, #blocked> loc(#loc114)
62
+ %tmp0 = arith.muli %ks0, %c32_i64 : i64 loc(#loc81)
63
+ %tmp0_28 = arith.extsi %r0_index_18 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc82)
64
+ %tmp0_29 = tt.splat %tmp0 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc82)
65
+ %tmp0_30 = arith.muli %tmp0_29, %tmp0_28 : tensor<1x32xi64, #blocked1> loc(#loc82)
66
+ %tmp0_31 = tt.broadcast %x2 : tensor<128x1xi64, #blocked1> -> tensor<128x32xi64, #blocked1> loc(#loc83)
67
+ %tmp0_32 = tt.broadcast %tmp0_30 : tensor<1x32xi64, #blocked1> -> tensor<128x32xi64, #blocked1> loc(#loc83)
68
+ %tmp0_33 = arith.addi %tmp0_31, %tmp0_32 : tensor<128x32xi64, #blocked1> loc(#loc83)
69
+ %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<128x32x!tt.ptr<f32>, #blocked1> loc(#loc84)
70
+ %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<128x32x!tt.ptr<f32>, #blocked1>, tensor<128x32xi64, #blocked1> loc(#loc84)
71
+ %tmp0_36 = tt.broadcast %xmask_16 : tensor<128x1xi1, #blocked1> -> tensor<128x32xi1, #blocked1> loc(#loc85)
72
+ %tmp0_37 = tt.load %tmp0_35, %tmp0_36, %cst_6 : tensor<128x32x!tt.ptr<f32>, #blocked1> loc(#loc85)
73
+ %tmp5 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<128x32x!tt.ptr<f32>, #blocked1> loc(#loc86)
74
+ %tmp5_38 = tt.addptr %tmp5, %tmp0_33 : tensor<128x32x!tt.ptr<f32>, #blocked1>, tensor<128x32xi64, #blocked1> loc(#loc86)
75
+ %tmp5_39 = tt.load %tmp5_38, %tmp0_36, %cst_6 : tensor<128x32x!tt.ptr<f32>, #blocked1> loc(#loc87)
76
+ %tmp3 = arith.select %tmp0_36, %tmp0_37, %cst_5 : tensor<128x32xi1, #blocked1>, tensor<128x32xf32, #blocked1> loc(#loc88)
77
+ %tmp4 = "tt.reduce"(%tmp3) <{axis = 1 : i32}> ({
78
+ ^bb0(%tmp4_48: f32 loc(callsite(#loc1 at #loc89)), %tmp4_49: f32 loc(callsite(#loc1 at #loc89))):
79
+ %mask = arith.cmpf ogt, %tmp4_48, %tmp4_49 : f32 loc(#loc121)
80
+ %mask_50 = arith.cmpf une, %tmp4_48, %tmp4_48 : f32 loc(#loc122)
81
+ %mask_51 = arith.ori %mask, %mask_50 : i1 loc(#loc123)
82
+ %tmp4_52 = arith.select %mask_51, %tmp4_48, %tmp4_49 : f32 loc(#loc124)
83
+ tt.reduce.return %tmp4_52 : f32 loc(#loc115)
84
+ }) : (tensor<128x32xf32, #blocked1>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc115)
85
+ %tmp4_40 = ttg.convert_layout %tmp4 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc93)
86
+ %tmp4_41 = tt.expand_dims %tmp4_40 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xf32, #blocked> loc(#loc93)
87
+ %tmp4_42 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1xf32, #blocked1> loc(#loc93)
88
+ %tmp7 = arith.cmpf oeq, %tmp4_42, %cst_4 : tensor<128x1xf32, #blocked1> loc(#loc94)
89
+ %tmp8 = tt.broadcast %tmp4_42 : tensor<128x1xf32, #blocked1> -> tensor<128x32xf32, #blocked1> loc(#loc95)
90
+ %tmp8_43 = arith.subf %tmp0_37, %tmp8 : tensor<128x32xf32, #blocked1> loc(#loc95)
91
+ %tmp10 = tt.broadcast %tmp7 : tensor<128x1xi1, #blocked1> -> tensor<128x32xi1, #blocked1> loc(#loc96)
92
+ %tmp10_44 = arith.select %tmp10, %cst_6, %tmp8_43 : tensor<128x32xi1, #blocked1>, tensor<128x32xf32, #blocked1> loc(#loc96)
93
+ %tmp11 = tt.extern_elementwise %tmp10_44 {libname = "", libpath = "", pure = true, symbol = "__nv_exp2f"} : (tensor<128x32xf32, #blocked1>) -> tensor<128x32xf32, #blocked1> loc(#loc97)
94
+ %tmp12 = arith.mulf %tmp5_39, %tmp11 : tensor<128x32xf32, #blocked1> loc(#loc98)
95
+ %tmp15 = arith.select %tmp0_36, %tmp12, %cst_6 : tensor<128x32xi1, #blocked1>, tensor<128x32xf32, #blocked1> loc(#loc99)
96
+ %tmp16 = "tt.reduce"(%tmp15) <{axis = 1 : i32}> ({
97
+ ^bb0(%tmp16_48: f32 loc(callsite(#loc1 at #loc100)), %tmp16_49: f32 loc(callsite(#loc1 at #loc100))):
98
+ %tmp16_50 = arith.addf %tmp16_48, %tmp16_49 : f32 loc(#loc125)
99
+ tt.reduce.return %tmp16_50 : f32 loc(#loc119)
100
+ }) : (tensor<128x32xf32, #blocked1>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc119)
101
+ %tmp16_45 = ttg.convert_layout %tmp16 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc101)
102
+ %tmp16_46 = tt.expand_dims %tmp16_45 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xf32, #blocked> loc(#loc101)
103
+ %tmp16_47 = tt.expand_dims %tmp16 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1xf32, #blocked1> loc(#loc101)
104
+ %tmp18 = arith.select %tmp7, %cst_3, %tmp16_47 : tensor<128x1xi1, #blocked1>, tensor<128x1xf32, #blocked1> loc(#loc102)
105
+ %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_log2f"} : (tensor<128x1xf32, #blocked1>) -> tensor<128x1xf32, #blocked1> loc(#loc103)
106
+ %tmp20 = arith.addf %tmp19, %tmp4_42 : tensor<128x1xf32, #blocked1> loc(#loc104)
107
+ %tmp22 = arith.mulf %tmp20, %cst_2 : tensor<128x1xf32, #blocked1> loc(#loc105)
108
+ %0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc47)
109
+ %1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc48)
110
+ %2 = arith.extui %1 : i1 to i64 loc(#loc49)
111
+ %3 = arith.muli %ks0, %2 : i64 loc(#loc49)
112
+ %4 = arith.extui %0 : i1 to i64 loc(#loc106)
113
+ %5 = arith.addi %4, %3 : i64 loc(#loc50)
114
+ %6 = tt.splat %5 : i64 -> tensor<128x1xi64, #blocked> loc(#loc52)
115
+ %7 = arith.muli %x3_27, %6 : tensor<128x1xi64, #blocked> loc(#loc52)
116
+ %8 = arith.addi %x2_21, %7 : tensor<128x1xi64, #blocked> loc(#loc53)
117
+ %9 = tt.splat %out_ptr2 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>, #blocked> loc(#loc54)
118
+ %10 = tt.addptr %9, %8 : tensor<128x1x!tt.ptr<f32>, #blocked>, tensor<128x1xi64, #blocked> loc(#loc54)
119
+ %11 = ttg.convert_layout %tmp22 : tensor<128x1xf32, #blocked1> -> tensor<128x1xf32, #blocked> loc(#loc55)
120
+ tt.store %10, %11, %xmask_17 : tensor<128x1x!tt.ptr<f32>, #blocked> loc(#loc55)
121
+ %12 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>, #blocked> loc(#loc56)
122
+ %13 = tt.addptr %12, %xindex_14 : tensor<128x1x!tt.ptr<f32>, #blocked>, tensor<128x1xi32, #blocked> loc(#loc56)
123
+ tt.store %13, %tmp4_41, %xmask_17 : tensor<128x1x!tt.ptr<f32>, #blocked> loc(#loc57)
124
+ %14 = tt.splat %out_ptr1 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>, #blocked> loc(#loc58)
125
+ %15 = tt.addptr %14, %xindex_14 : tensor<128x1x!tt.ptr<f32>, #blocked>, tensor<128x1xi32, #blocked> loc(#loc58)
126
+ tt.store %15, %tmp16_46, %xmask_17 : tensor<128x1x!tt.ptr<f32>, #blocked> loc(#loc59)
127
+ tt.return loc(#loc60)
128
+ } loc(#loc)
129
+ } loc(#loc)
130
+ #loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":23:28)
131
+ #loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":23:33)
132
+ #loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":24:44)
133
+ #loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":24:23)
134
+ #loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":25:21)
135
+ #loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":26:38)
136
+ #loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":33:19)
137
+ #loc9 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
138
+ #loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":34:51)
139
+ #loc11 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
140
+ #loc12 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
141
+ #loc13 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
142
+ #loc14 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
143
+ #loc15 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36)
144
+ #loc16 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
145
+ #loc17 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
146
+ #loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":35:38)
147
+ #loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":35:42)
148
+ #loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":35:35)
149
+ #loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":35:30)
150
+ #loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":35:49)
151
+ #loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":36:30)
152
+ #loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":36:49)
153
+ #loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":38:33)
154
+ #loc26 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29)
155
+ #loc28 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15)
156
+ #loc29 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21)
157
+ #loc30 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16)
158
+ #loc31 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29)
159
+ #loc32 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":39:40)
160
+ #loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":41:19)
161
+ #loc34 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":42:18)
162
+ #loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":44:33)
163
+ #loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":45:27)
164
+ #loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":46:19)
165
+ #loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":48:35)
166
+ #loc39 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
167
+ #loc41 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
168
+ #loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":49:29)
169
+ #loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":51:34)
170
+ #loc44 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":52:27)
171
+ #loc45 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":53:20)
172
+ #loc46 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":55:20)
173
+ #loc47 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:49)
174
+ #loc48 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:75)
175
+ #loc49 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:66)
176
+ #loc50 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:57)
177
+ #loc51 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:41)
178
+ #loc52 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:34)
179
+ #loc53 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:30)
180
+ #loc54 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:25)
181
+ #loc55 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:89)
182
+ #loc56 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":57:25)
183
+ #loc57 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":57:36)
184
+ #loc58 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":58:25)
185
+ #loc59 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":58:37)
186
+ #loc60 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":58:4)
187
+ #loc69 = loc("xoffset"(#loc2))
188
+ #loc70 = loc("xoffset"(#loc3))
189
+ #loc71 = loc("xindex"(#loc4))
190
+ #loc72 = loc("xindex"(#loc5))
191
+ #loc73 = loc("xmask"(#loc6))
192
+ #loc74 = loc("r0_index"(#loc7))
193
+ #loc75 = loc("x2"(#loc8))
194
+ #loc76 = loc("quot"(#loc9))
195
+ #loc77 = loc("x3"(#loc10))
196
+ #loc78 = loc("fixed"(#loc11))
197
+ #loc79 = loc("fixed"(#loc12))
198
+ #loc80 = loc("fixed"(#loc13))
199
+ #loc81 = loc("tmp0"(#loc18))
200
+ #loc82 = loc("tmp0"(#loc19))
201
+ #loc83 = loc("tmp0"(#loc20))
202
+ #loc84 = loc("tmp0"(#loc21))
203
+ #loc85 = loc("tmp0"(#loc22))
204
+ #loc86 = loc("tmp5"(#loc23))
205
+ #loc87 = loc("tmp5"(#loc24))
206
+ #loc88 = loc("tmp3"(#loc25))
207
+ #loc90 = loc("mask"(#loc28))
208
+ #loc91 = loc("mask"(#loc29))
209
+ #loc92 = loc("mask"(#loc30))
210
+ #loc93 = loc("tmp4"(#loc32))
211
+ #loc94 = loc("tmp7"(#loc33))
212
+ #loc95 = loc("tmp8"(#loc34))
213
+ #loc96 = loc("tmp10"(#loc35))
214
+ #loc97 = loc("tmp11"(#loc36))
215
+ #loc98 = loc("tmp12"(#loc37))
216
+ #loc99 = loc("tmp15"(#loc38))
217
+ #loc101 = loc("tmp16"(#loc42))
218
+ #loc102 = loc("tmp18"(#loc43))
219
+ #loc103 = loc("tmp19"(#loc44))
220
+ #loc104 = loc("tmp20"(#loc45))
221
+ #loc105 = loc("tmp22"(#loc46))
222
+ #loc106 = loc(fused[#loc50, #loc51])
223
+ #loc107 = loc(callsite(#loc76 at #loc77))
224
+ #loc108 = loc(callsite(#loc78 at #loc77))
225
+ #loc109 = loc(callsite(#loc79 at #loc77))
226
+ #loc110 = loc(callsite(#loc80 at #loc77))
227
+ #loc111 = loc(callsite(#loc14 at #loc77))
228
+ #loc112 = loc(callsite(#loc15 at #loc77))
229
+ #loc113 = loc(callsite(#loc16 at #loc77))
230
+ #loc114 = loc(callsite(#loc17 at #loc77))
231
+ #loc115 = loc(callsite(#loc26 at #loc89))
232
+ #loc117 = loc("mask"(#loc90))
233
+ #loc118 = loc("mask"(#loc92))
234
+ #loc119 = loc(callsite(#loc39 at #loc100))
235
+ #loc121 = loc(callsite(#loc117 at #loc115))
236
+ #loc122 = loc(callsite(#loc91 at #loc115))
237
+ #loc123 = loc(callsite(#loc118 at #loc115))
238
+ #loc124 = loc(callsite(#loc31 at #loc115))
239
+ #loc125 = loc(callsite(#loc41 at #loc119))
progress/SpecForge/cache/compiled_kernels/triton/3/25EJS5K5XTRHFZ4RQBPDWY7APVK52Y4IKRJWVKO66CD6SDLYFUWQ/triton_per_fused_mul_1.ttir ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":18:0)
2
+ #loc6 = loc(unknown)
3
+ #loc32 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":39:37)
4
+ #loc44 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":49:26)
5
+ #loc63 = loc("in_ptr0"(#loc))
6
+ #loc64 = loc("in_ptr1"(#loc))
7
+ #loc65 = loc("out_ptr0"(#loc))
8
+ #loc66 = loc("out_ptr1"(#loc))
9
+ #loc67 = loc("out_ptr2"(#loc))
10
+ #loc68 = loc("ks0"(#loc))
11
+ #loc69 = loc("xnumel"(#loc))
12
+ #loc70 = loc("r0_numel"(#loc))
13
+ #loc96 = loc("tmp4"(#loc32))
14
+ #loc106 = loc("tmp16"(#loc44))
15
+ #loc120 = loc(callsite(#loc6 at #loc96))
16
+ #loc124 = loc(callsite(#loc6 at #loc106))
17
+ module {
18
+ tt.func public @triton_per_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
19
+ %fixed = arith.constant dense<1> : tensor<128x1xi64> loc(#loc111)
20
+ %x3 = arith.constant dense<0> : tensor<128x1xi32> loc(#loc112)
21
+ %fixed_0 = arith.constant dense<0> : tensor<128x1xi64> loc(#loc113)
22
+ %x3_1 = arith.constant 0 : i64 loc(#loc114)
23
+ %c1_i64 = arith.constant 1 : i64 loc(#loc6)
24
+ %tmp22 = arith.constant dense<0.693147182> : tensor<128x1xf32> loc(#loc74)
25
+ %tmp18 = arith.constant dense<1.000000e+00> : tensor<128x1xf32> loc(#loc75)
26
+ %tmp7 = arith.constant dense<0xFF800000> : tensor<128x1xf32> loc(#loc76)
27
+ %tmp3 = arith.constant dense<0xFF800000> : tensor<128x32xf32> loc(#loc77)
28
+ %cst = arith.constant dense<0.000000e+00> : tensor<128x32xf32> loc(#loc6)
29
+ %c32_i64 = arith.constant 32 : i64 loc(#loc6)
30
+ %c128_i32 = arith.constant 128 : i32 loc(#loc6)
31
+ %xoffset = tt.get_program_id x : i32 loc(#loc78)
32
+ %xoffset_2 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc79)
33
+ %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc80)
34
+ %xindex_3 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc81)
35
+ %xindex_4 = tt.splat %xoffset_2 : i32 -> tensor<128x1xi32> loc(#loc82)
36
+ %xindex_5 = arith.addi %xindex_4, %xindex_3 : tensor<128x1xi32> loc(#loc82)
37
+ %xmask = tt.splat %xnumel : i32 -> tensor<128x1xi32> loc(#loc83)
38
+ %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<128x1xi32> loc(#loc83)
39
+ %r0_index = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc84)
40
+ %r0_index_7 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc85)
41
+ %x2 = arith.extsi %xindex_5 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc86)
42
+ %x2_8 = tt.splat %ks0 : i64 -> tensor<128x1xi64> loc(#loc86)
43
+ %x2_9 = arith.remsi %x2, %x2_8 : tensor<128x1xi64> loc(#loc86)
44
+ %quot = arith.divsi %x2, %x2_8 : tensor<128x1xi64> loc(#loc115)
45
+ %fixed_10 = arith.cmpi ne, %x2_9, %fixed_0 : tensor<128x1xi64> loc(#loc113)
46
+ %fixed_11 = arith.subi %quot, %fixed : tensor<128x1xi64> loc(#loc111)
47
+ %fixed_12 = arith.select %fixed_10, %fixed_11, %quot : tensor<128x1xi1>, tensor<128x1xi64> loc(#loc116)
48
+ %x3_13 = arith.cmpi slt, %xindex_5, %x3 : tensor<128x1xi32> loc(#loc112)
49
+ %x3_14 = arith.cmpi slt, %ks0, %x3_1 : i64 loc(#loc114)
50
+ %x3_15 = tt.splat %x3_14 : i1 -> tensor<128x1xi1> loc(#loc117)
51
+ %x3_16 = arith.cmpi ne, %x3_13, %x3_15 : tensor<128x1xi1> loc(#loc117)
52
+ %x3_17 = arith.select %x3_16, %fixed_12, %quot : tensor<128x1xi1>, tensor<128x1xi64> loc(#loc118)
53
+ %tmp0 = arith.muli %ks0, %c32_i64 : i64 loc(#loc89)
54
+ %tmp0_18 = arith.extsi %r0_index_7 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc90)
55
+ %tmp0_19 = tt.splat %tmp0 : i64 -> tensor<1x32xi64> loc(#loc90)
56
+ %tmp0_20 = arith.muli %tmp0_19, %tmp0_18 : tensor<1x32xi64> loc(#loc90)
57
+ %tmp0_21 = tt.broadcast %x2 : tensor<128x1xi64> -> tensor<128x32xi64> loc(#loc91)
58
+ %tmp0_22 = tt.broadcast %tmp0_20 : tensor<1x32xi64> -> tensor<128x32xi64> loc(#loc91)
59
+ %tmp0_23 = arith.addi %tmp0_21, %tmp0_22 : tensor<128x32xi64> loc(#loc91)
60
+ %tmp0_24 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<128x32x!tt.ptr<f32>> loc(#loc92)
61
+ %tmp0_25 = tt.addptr %tmp0_24, %tmp0_23 : tensor<128x32x!tt.ptr<f32>>, tensor<128x32xi64> loc(#loc92)
62
+ %tmp0_26 = tt.broadcast %xmask_6 : tensor<128x1xi1> -> tensor<128x32xi1> loc(#loc93)
63
+ %tmp0_27 = tt.load %tmp0_25, %tmp0_26, %cst : tensor<128x32x!tt.ptr<f32>> loc(#loc93)
64
+ %tmp5 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<128x32x!tt.ptr<f32>> loc(#loc94)
65
+ %tmp5_28 = tt.addptr %tmp5, %tmp0_23 : tensor<128x32x!tt.ptr<f32>>, tensor<128x32xi64> loc(#loc94)
66
+ %tmp5_29 = tt.load %tmp5_28, %tmp0_26, %cst : tensor<128x32x!tt.ptr<f32>> loc(#loc95)
67
+ %tmp3_30 = arith.select %tmp0_26, %tmp0_27, %tmp3 : tensor<128x32xi1>, tensor<128x32xf32> loc(#loc77)
68
+ %tmp4 = "tt.reduce"(%tmp3_30) <{axis = 1 : i32}> ({
69
+ ^bb0(%tmp4_38: f32 loc(callsite(#loc6 at #loc96)), %tmp4_39: f32 loc(callsite(#loc6 at #loc96))):
70
+ %mask = arith.cmpf ogt, %tmp4_38, %tmp4_39 : f32 loc(#loc125)
71
+ %mask_40 = arith.cmpf une, %tmp4_38, %tmp4_38 : f32 loc(#loc126)
72
+ %mask_41 = arith.ori %mask, %mask_40 : i1 loc(#loc127)
73
+ %tmp4_42 = arith.select %mask_41, %tmp4_38, %tmp4_39 : f32 loc(#loc128)
74
+ tt.reduce.return %tmp4_42 : f32 loc(#loc119)
75
+ }) : (tensor<128x32xf32>) -> tensor<128xf32> loc(#loc119)
76
+ %tmp4_31 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc100)
77
+ %tmp7_32 = arith.cmpf oeq, %tmp4_31, %tmp7 : tensor<128x1xf32> loc(#loc76)
78
+ %tmp8 = tt.broadcast %tmp4_31 : tensor<128x1xf32> -> tensor<128x32xf32> loc(#loc101)
79
+ %tmp8_33 = arith.subf %tmp0_27, %tmp8 : tensor<128x32xf32> loc(#loc101)
80
+ %tmp10 = tt.broadcast %tmp7_32 : tensor<128x1xi1> -> tensor<128x32xi1> loc(#loc102)
81
+ %tmp10_34 = arith.select %tmp10, %cst, %tmp8_33 : tensor<128x32xi1>, tensor<128x32xf32> loc(#loc102)
82
+ %tmp11 = tt.extern_elementwise %tmp10_34 {libname = "", libpath = "", pure = true, symbol = "__nv_exp2f"} : (tensor<128x32xf32>) -> tensor<128x32xf32> loc(#loc103)
83
+ %tmp12 = arith.mulf %tmp5_29, %tmp11 : tensor<128x32xf32> loc(#loc104)
84
+ %tmp15 = arith.select %tmp0_26, %tmp12, %cst : tensor<128x32xi1>, tensor<128x32xf32> loc(#loc105)
85
+ %tmp16 = "tt.reduce"(%tmp15) <{axis = 1 : i32}> ({
86
+ ^bb0(%tmp16_38: f32 loc(callsite(#loc6 at #loc106)), %tmp16_39: f32 loc(callsite(#loc6 at #loc106))):
87
+ %tmp16_40 = arith.addf %tmp16_38, %tmp16_39 : f32 loc(#loc129)
88
+ tt.reduce.return %tmp16_40 : f32 loc(#loc123)
89
+ }) : (tensor<128x32xf32>) -> tensor<128xf32> loc(#loc123)
90
+ %tmp16_35 = tt.expand_dims %tmp16 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc107)
91
+ %tmp18_36 = arith.select %tmp7_32, %tmp18, %tmp16_35 : tensor<128x1xi1>, tensor<128x1xf32> loc(#loc75)
92
+ %tmp19 = tt.extern_elementwise %tmp18_36 {libname = "", libpath = "", pure = true, symbol = "__nv_log2f"} : (tensor<128x1xf32>) -> tensor<128x1xf32> loc(#loc108)
93
+ %tmp20 = arith.addf %tmp19, %tmp4_31 : tensor<128x1xf32> loc(#loc109)
94
+ %tmp22_37 = arith.mulf %tmp20, %tmp22 : tensor<128x1xf32> loc(#loc74)
95
+ %0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc49)
96
+ %1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc50)
97
+ %2 = arith.extui %1 : i1 to i64 loc(#loc51)
98
+ %3 = arith.muli %ks0, %2 : i64 loc(#loc51)
99
+ %4 = arith.extui %0 : i1 to i64 loc(#loc110)
100
+ %5 = arith.addi %4, %3 : i64 loc(#loc52)
101
+ %6 = tt.splat %5 : i64 -> tensor<128x1xi64> loc(#loc54)
102
+ %7 = arith.muli %x3_17, %6 : tensor<128x1xi64> loc(#loc54)
103
+ %8 = arith.addi %x2_9, %7 : tensor<128x1xi64> loc(#loc55)
104
+ %9 = tt.splat %out_ptr2 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>> loc(#loc56)
105
+ %10 = tt.addptr %9, %8 : tensor<128x1x!tt.ptr<f32>>, tensor<128x1xi64> loc(#loc56)
106
+ tt.store %10, %tmp22_37, %xmask_6 : tensor<128x1x!tt.ptr<f32>> loc(#loc57)
107
+ %11 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>> loc(#loc58)
108
+ %12 = tt.addptr %11, %xindex_5 : tensor<128x1x!tt.ptr<f32>>, tensor<128x1xi32> loc(#loc58)
109
+ tt.store %12, %tmp4_31, %xmask_6 : tensor<128x1x!tt.ptr<f32>> loc(#loc59)
110
+ %13 = tt.splat %out_ptr1 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>> loc(#loc60)
111
+ %14 = tt.addptr %13, %xindex_5 : tensor<128x1x!tt.ptr<f32>>, tensor<128x1xi32> loc(#loc60)
112
+ tt.store %14, %tmp16_35, %xmask_6 : tensor<128x1x!tt.ptr<f32>> loc(#loc61)
113
+ tt.return loc(#loc62)
114
+ } loc(#loc)
115
+ } loc(#loc)
116
+ #loc1 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
117
+ #loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":34:51)
118
+ #loc3 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
119
+ #loc4 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
120
+ #loc5 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36)
121
+ #loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":55:20)
122
+ #loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":51:34)
123
+ #loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":41:19)
124
+ #loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":38:33)
125
+ #loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":23:28)
126
+ #loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":23:33)
127
+ #loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":24:36)
128
+ #loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":24:44)
129
+ #loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":24:23)
130
+ #loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":25:21)
131
+ #loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":26:28)
132
+ #loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":26:38)
133
+ #loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":33:19)
134
+ #loc20 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
135
+ #loc21 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
136
+ #loc22 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
137
+ #loc23 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
138
+ #loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":35:38)
139
+ #loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":35:42)
140
+ #loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":35:35)
141
+ #loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":35:30)
142
+ #loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":35:49)
143
+ #loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":36:30)
144
+ #loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":36:49)
145
+ #loc31 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29)
146
+ #loc33 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15)
147
+ #loc34 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21)
148
+ #loc35 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16)
149
+ #loc36 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29)
150
+ #loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":39:40)
151
+ #loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":42:18)
152
+ #loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":44:33)
153
+ #loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":45:27)
154
+ #loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":46:19)
155
+ #loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":48:35)
156
+ #loc43 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
157
+ #loc45 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
158
+ #loc46 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":49:29)
159
+ #loc47 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":52:27)
160
+ #loc48 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":53:20)
161
+ #loc49 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:49)
162
+ #loc50 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:75)
163
+ #loc51 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:66)
164
+ #loc52 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:57)
165
+ #loc53 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:41)
166
+ #loc54 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:34)
167
+ #loc55 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:30)
168
+ #loc56 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:25)
169
+ #loc57 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":56:89)
170
+ #loc58 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":57:25)
171
+ #loc59 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":57:36)
172
+ #loc60 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":58:25)
173
+ #loc61 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":58:37)
174
+ #loc62 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/6g/c6gb52skvqs7or57vd3zu5um3r5rnmeimd5qam27l5j7uqx7t4ai.py":58:4)
175
+ #loc71 = loc("fixed"(#loc1))
176
+ #loc72 = loc("x3"(#loc2))
177
+ #loc73 = loc("fixed"(#loc4))
178
+ #loc74 = loc("tmp22"(#loc7))
179
+ #loc75 = loc("tmp18"(#loc8))
180
+ #loc76 = loc("tmp7"(#loc9))
181
+ #loc77 = loc("tmp3"(#loc10))
182
+ #loc78 = loc("xoffset"(#loc11))
183
+ #loc79 = loc("xoffset"(#loc12))
184
+ #loc80 = loc("xindex"(#loc13))
185
+ #loc81 = loc("xindex"(#loc14))
186
+ #loc82 = loc("xindex"(#loc15))
187
+ #loc83 = loc("xmask"(#loc16))
188
+ #loc84 = loc("r0_index"(#loc17))
189
+ #loc85 = loc("r0_index"(#loc18))
190
+ #loc86 = loc("x2"(#loc19))
191
+ #loc87 = loc("quot"(#loc20))
192
+ #loc88 = loc("fixed"(#loc21))
193
+ #loc89 = loc("tmp0"(#loc24))
194
+ #loc90 = loc("tmp0"(#loc25))
195
+ #loc91 = loc("tmp0"(#loc26))
196
+ #loc92 = loc("tmp0"(#loc27))
197
+ #loc93 = loc("tmp0"(#loc28))
198
+ #loc94 = loc("tmp5"(#loc29))
199
+ #loc95 = loc("tmp5"(#loc30))
200
+ #loc97 = loc("mask"(#loc33))
201
+ #loc98 = loc("mask"(#loc34))
202
+ #loc99 = loc("mask"(#loc35))
203
+ #loc100 = loc("tmp4"(#loc37))
204
+ #loc101 = loc("tmp8"(#loc38))
205
+ #loc102 = loc("tmp10"(#loc39))
206
+ #loc103 = loc("tmp11"(#loc40))
207
+ #loc104 = loc("tmp12"(#loc41))
208
+ #loc105 = loc("tmp15"(#loc42))
209
+ #loc107 = loc("tmp16"(#loc46))
210
+ #loc108 = loc("tmp19"(#loc47))
211
+ #loc109 = loc("tmp20"(#loc48))
212
+ #loc110 = loc(fused[#loc52, #loc53])
213
+ #loc111 = loc(callsite(#loc71 at #loc72))
214
+ #loc112 = loc(callsite(#loc3 at #loc72))
215
+ #loc113 = loc(callsite(#loc73 at #loc72))
216
+ #loc114 = loc(callsite(#loc5 at #loc72))
217
+ #loc115 = loc(callsite(#loc87 at #loc72))
218
+ #loc116 = loc(callsite(#loc88 at #loc72))
219
+ #loc117 = loc(callsite(#loc22 at #loc72))
220
+ #loc118 = loc(callsite(#loc23 at #loc72))
221
+ #loc119 = loc(callsite(#loc31 at #loc96))
222
+ #loc121 = loc("mask"(#loc97))
223
+ #loc122 = loc("mask"(#loc99))
224
+ #loc123 = loc(callsite(#loc43 at #loc106))
225
+ #loc125 = loc(callsite(#loc121 at #loc119))
226
+ #loc126 = loc(callsite(#loc98 at #loc119))
227
+ #loc127 = loc(callsite(#loc122 at #loc119))
228
+ #loc128 = loc(callsite(#loc36 at #loc119))
229
+ #loc129 = loc(callsite(#loc45 at #loc123))
progress/SpecForge/cache/compiled_kernels/triton/3/2GPX4H2RCLQEV2LIJWPK5V4BGHUBOBJI3FUMPIZKQJ7CMB34OTAA/__grp__triton_per_fused_mul_1.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_per_fused_mul_1.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/2GPX4H2RCLQEV2LIJWPK5V4BGHUBOBJI3FUMPIZKQJ7CMB34OTAA/triton_per_fused_mul_1.source", "triton_per_fused_mul_1.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/2GPX4H2RCLQEV2LIJWPK5V4BGHUBOBJI3FUMPIZKQJ7CMB34OTAA/triton_per_fused_mul_1.ttir", "triton_per_fused_mul_1.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/2GPX4H2RCLQEV2LIJWPK5V4BGHUBOBJI3FUMPIZKQJ7CMB34OTAA/triton_per_fused_mul_1.ttgir", "triton_per_fused_mul_1.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/2GPX4H2RCLQEV2LIJWPK5V4BGHUBOBJI3FUMPIZKQJ7CMB34OTAA/triton_per_fused_mul_1.llir", "triton_per_fused_mul_1.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/2GPX4H2RCLQEV2LIJWPK5V4BGHUBOBJI3FUMPIZKQJ7CMB34OTAA/triton_per_fused_mul_1.ptx", "triton_per_fused_mul_1.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/2GPX4H2RCLQEV2LIJWPK5V4BGHUBOBJI3FUMPIZKQJ7CMB34OTAA/triton_per_fused_mul_1.cubin", "triton_per_fused_mul_1.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/2GPX4H2RCLQEV2LIJWPK5V4BGHUBOBJI3FUMPIZKQJ7CMB34OTAA/triton_per_fused_mul_1.json"}}
progress/SpecForge/cache/compiled_kernels/triton/3/2GPX4H2RCLQEV2LIJWPK5V4BGHUBOBJI3FUMPIZKQJ7CMB34OTAA/triton_per_fused_mul_1.cubin ADDED
Binary file (35.5 kB). View file
 
progress/SpecForge/cache/compiled_kernels/triton/3/2GPX4H2RCLQEV2LIJWPK5V4BGHUBOBJI3FUMPIZKQJ7CMB34OTAA/triton_per_fused_mul_1.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "d19f7e1f5112e04ae9684d9eaed78131e8170528d968c7a32a827e26077c74c0", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 4096, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused_mul_1"}
progress/SpecForge/cache/compiled_kernels/triton/3/2GPX4H2RCLQEV2LIJWPK5V4BGHUBOBJI3FUMPIZKQJ7CMB34OTAA/triton_per_fused_mul_1.llir ADDED
@@ -0,0 +1,1153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+ target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
4
+
5
+ @global_smem = external addrspace(3) global [0 x i8], align 16
6
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
7
+
8
+ ; Function Attrs: nounwind
9
+ define ptx_kernel void @triton_per_fused_mul_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6, ptr addrspace(1) readnone captures(none) %7, ptr addrspace(1) readnone captures(none) %8) local_unnamed_addr #0 !dbg !5 {
10
+ %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8
11
+ %11 = shl i32 %10, 7, !dbg !9
12
+ %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
13
+ %13 = lshr i32 %12, 5, !dbg !10
14
+ %14 = and i32 %12, 31, !dbg !10
15
+ %15 = shl nuw nsw i32 %14, 2, !dbg !10
16
+ %16 = or disjoint i32 %15, %11, !dbg !11
17
+ %17 = icmp slt i32 %16, 1536, !dbg !12
18
+ %18 = and i32 %13, 7, !dbg !13
19
+ %19 = or i32 %13, 24, !dbg !13
20
+ %20 = mul nuw nsw i32 %18, 1536, !dbg !14
21
+ %21 = add nuw nsw i32 %20, 12288, !dbg !14
22
+ %22 = add nuw nsw i32 %20, 24576, !dbg !14
23
+ %23 = mul nuw nsw i32 %19, 1536, !dbg !14
24
+ %24 = add i32 %16, %20, !dbg !15
25
+ %25 = add i32 %21, %16, !dbg !15
26
+ %26 = add i32 %22, %16, !dbg !15
27
+ %27 = add i32 %16, %23, !dbg !15
28
+ %28 = sext i32 %24 to i64, !dbg !16
29
+ %29 = getelementptr float, ptr addrspace(1) %0, i64 %28, !dbg !16
30
+ %30 = sext i32 %25 to i64, !dbg !16
31
+ %31 = getelementptr float, ptr addrspace(1) %0, i64 %30, !dbg !16
32
+ %32 = sext i32 %26 to i64, !dbg !16
33
+ %33 = getelementptr float, ptr addrspace(1) %0, i64 %32, !dbg !16
34
+ %34 = sext i32 %27 to i64, !dbg !16
35
+ %35 = getelementptr float, ptr addrspace(1) %0, i64 %34, !dbg !16
36
+ %36 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %29, i1 %17) #6, !dbg !17
37
+ %37 = extractvalue { i32, i32, i32, i32 } %36, 0, !dbg !17
38
+ %38 = extractvalue { i32, i32, i32, i32 } %36, 1, !dbg !17
39
+ %39 = extractvalue { i32, i32, i32, i32 } %36, 2, !dbg !17
40
+ %40 = extractvalue { i32, i32, i32, i32 } %36, 3, !dbg !17
41
+ %41 = bitcast i32 %37 to float, !dbg !17
42
+ %42 = bitcast i32 %38 to float, !dbg !17
43
+ %43 = bitcast i32 %39 to float, !dbg !17
44
+ %44 = bitcast i32 %40 to float, !dbg !17
45
+ %45 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %31, i1 %17) #6, !dbg !17
46
+ %46 = extractvalue { i32, i32, i32, i32 } %45, 0, !dbg !17
47
+ %47 = extractvalue { i32, i32, i32, i32 } %45, 1, !dbg !17
48
+ %48 = extractvalue { i32, i32, i32, i32 } %45, 2, !dbg !17
49
+ %49 = extractvalue { i32, i32, i32, i32 } %45, 3, !dbg !17
50
+ %50 = bitcast i32 %46 to float, !dbg !17
51
+ %51 = bitcast i32 %47 to float, !dbg !17
52
+ %52 = bitcast i32 %48 to float, !dbg !17
53
+ %53 = bitcast i32 %49 to float, !dbg !17
54
+ %54 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %33, i1 %17) #6, !dbg !17
55
+ %55 = extractvalue { i32, i32, i32, i32 } %54, 0, !dbg !17
56
+ %56 = extractvalue { i32, i32, i32, i32 } %54, 1, !dbg !17
57
+ %57 = extractvalue { i32, i32, i32, i32 } %54, 2, !dbg !17
58
+ %58 = extractvalue { i32, i32, i32, i32 } %54, 3, !dbg !17
59
+ %59 = bitcast i32 %55 to float, !dbg !17
60
+ %60 = bitcast i32 %56 to float, !dbg !17
61
+ %61 = bitcast i32 %57 to float, !dbg !17
62
+ %62 = bitcast i32 %58 to float, !dbg !17
63
+ %63 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %35, i1 %17) #6, !dbg !17
64
+ %64 = extractvalue { i32, i32, i32, i32 } %63, 0, !dbg !17
65
+ %65 = extractvalue { i32, i32, i32, i32 } %63, 1, !dbg !17
66
+ %66 = extractvalue { i32, i32, i32, i32 } %63, 2, !dbg !17
67
+ %67 = extractvalue { i32, i32, i32, i32 } %63, 3, !dbg !17
68
+ %68 = bitcast i32 %64 to float, !dbg !17
69
+ %69 = bitcast i32 %65 to float, !dbg !17
70
+ %70 = bitcast i32 %66 to float, !dbg !17
71
+ %71 = bitcast i32 %67 to float, !dbg !17
72
+ %72 = getelementptr float, ptr addrspace(1) %1, i64 %28, !dbg !18
73
+ %73 = getelementptr float, ptr addrspace(1) %1, i64 %30, !dbg !18
74
+ %74 = getelementptr float, ptr addrspace(1) %1, i64 %32, !dbg !18
75
+ %75 = getelementptr float, ptr addrspace(1) %1, i64 %34, !dbg !18
76
+ %76 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %72, i1 %17) #6, !dbg !19
77
+ %77 = extractvalue { i32, i32, i32, i32 } %76, 0, !dbg !19
78
+ %78 = extractvalue { i32, i32, i32, i32 } %76, 1, !dbg !19
79
+ %79 = extractvalue { i32, i32, i32, i32 } %76, 2, !dbg !19
80
+ %80 = extractvalue { i32, i32, i32, i32 } %76, 3, !dbg !19
81
+ %81 = bitcast i32 %77 to float, !dbg !19
82
+ %82 = bitcast i32 %78 to float, !dbg !19
83
+ %83 = bitcast i32 %79 to float, !dbg !19
84
+ %84 = bitcast i32 %80 to float, !dbg !19
85
+ %85 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %73, i1 %17) #6, !dbg !19
86
+ %86 = extractvalue { i32, i32, i32, i32 } %85, 0, !dbg !19
87
+ %87 = extractvalue { i32, i32, i32, i32 } %85, 1, !dbg !19
88
+ %88 = extractvalue { i32, i32, i32, i32 } %85, 2, !dbg !19
89
+ %89 = extractvalue { i32, i32, i32, i32 } %85, 3, !dbg !19
90
+ %90 = bitcast i32 %86 to float, !dbg !19
91
+ %91 = bitcast i32 %87 to float, !dbg !19
92
+ %92 = bitcast i32 %88 to float, !dbg !19
93
+ %93 = bitcast i32 %89 to float, !dbg !19
94
+ %94 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %74, i1 %17) #6, !dbg !19
95
+ %95 = extractvalue { i32, i32, i32, i32 } %94, 0, !dbg !19
96
+ %96 = extractvalue { i32, i32, i32, i32 } %94, 1, !dbg !19
97
+ %97 = extractvalue { i32, i32, i32, i32 } %94, 2, !dbg !19
98
+ %98 = extractvalue { i32, i32, i32, i32 } %94, 3, !dbg !19
99
+ %99 = bitcast i32 %95 to float, !dbg !19
100
+ %100 = bitcast i32 %96 to float, !dbg !19
101
+ %101 = bitcast i32 %97 to float, !dbg !19
102
+ %102 = bitcast i32 %98 to float, !dbg !19
103
+ %103 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %75, i1 %17) #6, !dbg !19
104
+ %104 = extractvalue { i32, i32, i32, i32 } %103, 0, !dbg !19
105
+ %105 = extractvalue { i32, i32, i32, i32 } %103, 1, !dbg !19
106
+ %106 = extractvalue { i32, i32, i32, i32 } %103, 2, !dbg !19
107
+ %107 = extractvalue { i32, i32, i32, i32 } %103, 3, !dbg !19
108
+ %108 = bitcast i32 %104 to float, !dbg !19
109
+ %109 = bitcast i32 %105 to float, !dbg !19
110
+ %110 = bitcast i32 %106 to float, !dbg !19
111
+ %111 = bitcast i32 %107 to float, !dbg !19
112
+ %112 = select i1 %17, float %41, float 0xFFF0000000000000, !dbg !20
113
+ %113 = select i1 %17, float %42, float 0xFFF0000000000000, !dbg !20
114
+ %114 = select i1 %17, float %43, float 0xFFF0000000000000, !dbg !20
115
+ %115 = select i1 %17, float %44, float 0xFFF0000000000000, !dbg !20
116
+ %116 = select i1 %17, float %50, float 0xFFF0000000000000, !dbg !20
117
+ %117 = select i1 %17, float %51, float 0xFFF0000000000000, !dbg !20
118
+ %118 = select i1 %17, float %52, float 0xFFF0000000000000, !dbg !20
119
+ %119 = select i1 %17, float %53, float 0xFFF0000000000000, !dbg !20
120
+ %120 = select i1 %17, float %59, float 0xFFF0000000000000, !dbg !20
121
+ %121 = select i1 %17, float %60, float 0xFFF0000000000000, !dbg !20
122
+ %122 = select i1 %17, float %61, float 0xFFF0000000000000, !dbg !20
123
+ %123 = select i1 %17, float %62, float 0xFFF0000000000000, !dbg !20
124
+ %124 = select i1 %17, float %68, float 0xFFF0000000000000, !dbg !20
125
+ %125 = select i1 %17, float %69, float 0xFFF0000000000000, !dbg !20
126
+ %126 = select i1 %17, float %70, float 0xFFF0000000000000, !dbg !20
127
+ %127 = select i1 %17, float %71, float 0xFFF0000000000000, !dbg !20
128
+ %128 = fcmp ogt float %112, %116, !dbg !21
129
+ %129 = fcmp uno float %112, 0.000000e+00, !dbg !25
130
+ %130 = or i1 %129, %128, !dbg !26
131
+ %131 = select i1 %130, float %112, float %116, !dbg !27
132
+ %132 = fcmp ogt float %113, %117, !dbg !21
133
+ %133 = fcmp uno float %113, 0.000000e+00, !dbg !25
134
+ %134 = or i1 %133, %132, !dbg !26
135
+ %135 = select i1 %134, float %113, float %117, !dbg !27
136
+ %136 = fcmp ogt float %114, %118, !dbg !21
137
+ %137 = fcmp uno float %114, 0.000000e+00, !dbg !25
138
+ %138 = or i1 %137, %136, !dbg !26
139
+ %139 = select i1 %138, float %114, float %118, !dbg !27
140
+ %140 = fcmp ogt float %115, %119, !dbg !21
141
+ %141 = fcmp uno float %115, 0.000000e+00, !dbg !25
142
+ %142 = or i1 %141, %140, !dbg !26
143
+ %143 = select i1 %142, float %115, float %119, !dbg !27
144
+ %144 = fcmp ogt float %131, %120, !dbg !21
145
+ %145 = fcmp uno float %131, 0.000000e+00, !dbg !25
146
+ %146 = or i1 %144, %145, !dbg !26
147
+ %147 = select i1 %146, float %131, float %120, !dbg !27
148
+ %148 = fcmp ogt float %135, %121, !dbg !21
149
+ %149 = fcmp uno float %135, 0.000000e+00, !dbg !25
150
+ %150 = or i1 %148, %149, !dbg !26
151
+ %151 = select i1 %150, float %135, float %121, !dbg !27
152
+ %152 = fcmp ogt float %139, %122, !dbg !21
153
+ %153 = fcmp uno float %139, 0.000000e+00, !dbg !25
154
+ %154 = or i1 %152, %153, !dbg !26
155
+ %155 = select i1 %154, float %139, float %122, !dbg !27
156
+ %156 = fcmp ogt float %143, %123, !dbg !21
157
+ %157 = fcmp uno float %143, 0.000000e+00, !dbg !25
158
+ %158 = or i1 %156, %157, !dbg !26
159
+ %159 = select i1 %158, float %143, float %123, !dbg !27
160
+ %160 = fcmp ogt float %147, %124, !dbg !21
161
+ %161 = fcmp uno float %147, 0.000000e+00, !dbg !25
162
+ %162 = or i1 %160, %161, !dbg !26
163
+ %163 = select i1 %162, float %147, float %124, !dbg !27
164
+ %164 = fcmp ogt float %151, %125, !dbg !21
165
+ %165 = fcmp uno float %151, 0.000000e+00, !dbg !25
166
+ %166 = or i1 %164, %165, !dbg !26
167
+ %167 = select i1 %166, float %151, float %125, !dbg !27
168
+ %168 = fcmp ogt float %155, %126, !dbg !21
169
+ %169 = fcmp uno float %155, 0.000000e+00, !dbg !25
170
+ %170 = or i1 %168, %169, !dbg !26
171
+ %171 = select i1 %170, float %155, float %126, !dbg !27
172
+ %172 = fcmp ogt float %159, %127, !dbg !21
173
+ %173 = fcmp uno float %159, 0.000000e+00, !dbg !25
174
+ %174 = or i1 %172, %173, !dbg !26
175
+ %175 = select i1 %174, float %159, float %127, !dbg !27
176
+ %.idx = shl nuw nsw i32 %14, 7, !dbg !28
177
+ %176 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx, !dbg !28
178
+ %177 = getelementptr float, ptr addrspace(3) %176, i32 %18, !dbg !28
179
+ %178 = bitcast float %163 to <1 x i32>, !dbg !28
180
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %177, <1 x i32> %178, i1 true) #6, !dbg !28
181
+ %179 = getelementptr i8, ptr addrspace(3) %176, i32 32, !dbg !28
182
+ %180 = getelementptr float, ptr addrspace(3) %179, i32 %18, !dbg !28
183
+ %181 = bitcast float %167 to <1 x i32>, !dbg !28
184
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %180, <1 x i32> %181, i1 true) #6, !dbg !28
185
+ %182 = getelementptr i8, ptr addrspace(3) %176, i32 64, !dbg !28
186
+ %183 = getelementptr float, ptr addrspace(3) %182, i32 %18, !dbg !28
187
+ %184 = bitcast float %171 to <1 x i32>, !dbg !28
188
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %183, <1 x i32> %184, i1 true) #6, !dbg !28
189
+ %185 = getelementptr i8, ptr addrspace(3) %176, i32 96, !dbg !28
190
+ %186 = getelementptr float, ptr addrspace(3) %185, i32 %18, !dbg !28
191
+ %187 = bitcast float %175 to <1 x i32>, !dbg !28
192
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %186, <1 x i32> %187, i1 true) #6, !dbg !28
193
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28
194
+ %188 = getelementptr float, ptr addrspace(3) @global_smem, i32 %12, !dbg !28
195
+ %189 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %188, i1 true) #6, !dbg !28
196
+ %190 = bitcast i32 %189 to float, !dbg !28
197
+ %191 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %189, i32 4, i32 31), !dbg !28
198
+ %192 = bitcast i32 %191 to float, !dbg !28
199
+ %193 = fcmp ogt float %190, %192, !dbg !21
200
+ %194 = fcmp uno float %190, 0.000000e+00, !dbg !25
201
+ %195 = or i1 %194, %193, !dbg !26
202
+ %196 = select i1 %195, float %190, float %192, !dbg !27
203
+ %197 = bitcast float %196 to i32, !dbg !28
204
+ %198 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %197, i32 2, i32 31), !dbg !28
205
+ %199 = bitcast i32 %198 to float, !dbg !28
206
+ %200 = fcmp ogt float %196, %199, !dbg !21
207
+ %201 = fcmp uno float %196, 0.000000e+00, !dbg !25
208
+ %202 = or i1 %200, %201, !dbg !26
209
+ %203 = select i1 %202, float %196, float %199, !dbg !27
210
+ %204 = bitcast float %203 to i32, !dbg !28
211
+ %205 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %204, i32 1, i32 31), !dbg !28
212
+ %206 = bitcast i32 %205 to float, !dbg !28
213
+ %207 = fcmp ogt float %203, %206, !dbg !21
214
+ %208 = fcmp uno float %203, 0.000000e+00, !dbg !25
215
+ %209 = or i1 %207, %208, !dbg !26
216
+ %210 = and i32 %12, 7, !dbg !28
217
+ %211 = icmp eq i32 %210, 0, !dbg !28
218
+ %212 = select i1 %209, i32 %204, i32 %205, !dbg !27
219
+ %213 = insertelement <1 x i32> poison, i32 %212, i64 0, !dbg !28
220
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %213, i1 %211) #6, !dbg !28
221
+ %214 = getelementptr i8, ptr addrspace(3) %188, i32 1024, !dbg !28
222
+ %215 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %214, i1 true) #6, !dbg !28
223
+ %216 = bitcast i32 %215 to float, !dbg !28
224
+ %217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %215, i32 4, i32 31), !dbg !28
225
+ %218 = bitcast i32 %217 to float, !dbg !28
226
+ %219 = fcmp ogt float %216, %218, !dbg !21
227
+ %220 = fcmp uno float %216, 0.000000e+00, !dbg !25
228
+ %221 = or i1 %220, %219, !dbg !26
229
+ %222 = select i1 %221, float %216, float %218, !dbg !27
230
+ %223 = bitcast float %222 to i32, !dbg !28
231
+ %224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %223, i32 2, i32 31), !dbg !28
232
+ %225 = bitcast i32 %224 to float, !dbg !28
233
+ %226 = fcmp ogt float %222, %225, !dbg !21
234
+ %227 = fcmp uno float %222, 0.000000e+00, !dbg !25
235
+ %228 = or i1 %226, %227, !dbg !26
236
+ %229 = select i1 %228, float %222, float %225, !dbg !27
237
+ %230 = bitcast float %229 to i32, !dbg !28
238
+ %231 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %230, i32 1, i32 31), !dbg !28
239
+ %232 = bitcast i32 %231 to float, !dbg !28
240
+ %233 = fcmp ogt float %229, %232, !dbg !21
241
+ %234 = fcmp uno float %229, 0.000000e+00, !dbg !25
242
+ %235 = or i1 %233, %234, !dbg !26
243
+ %236 = select i1 %235, i32 %230, i32 %231, !dbg !27
244
+ %237 = insertelement <1 x i32> poison, i32 %236, i64 0, !dbg !28
245
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %214, <1 x i32> %237, i1 %211) #6, !dbg !28
246
+ %238 = getelementptr i8, ptr addrspace(3) %188, i32 2048, !dbg !28
247
+ %239 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %238, i1 true) #6, !dbg !28
248
+ %240 = bitcast i32 %239 to float, !dbg !28
249
+ %241 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %239, i32 4, i32 31), !dbg !28
250
+ %242 = bitcast i32 %241 to float, !dbg !28
251
+ %243 = fcmp ogt float %240, %242, !dbg !21
252
+ %244 = fcmp uno float %240, 0.000000e+00, !dbg !25
253
+ %245 = or i1 %244, %243, !dbg !26
254
+ %246 = select i1 %245, float %240, float %242, !dbg !27
255
+ %247 = bitcast float %246 to i32, !dbg !28
256
+ %248 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %247, i32 2, i32 31), !dbg !28
257
+ %249 = bitcast i32 %248 to float, !dbg !28
258
+ %250 = fcmp ogt float %246, %249, !dbg !21
259
+ %251 = fcmp uno float %246, 0.000000e+00, !dbg !25
260
+ %252 = or i1 %250, %251, !dbg !26
261
+ %253 = select i1 %252, float %246, float %249, !dbg !27
262
+ %254 = bitcast float %253 to i32, !dbg !28
263
+ %255 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %254, i32 1, i32 31), !dbg !28
264
+ %256 = bitcast i32 %255 to float, !dbg !28
265
+ %257 = fcmp ogt float %253, %256, !dbg !21
266
+ %258 = fcmp uno float %253, 0.000000e+00, !dbg !25
267
+ %259 = or i1 %257, %258, !dbg !26
268
+ %260 = select i1 %259, i32 %254, i32 %255, !dbg !27
269
+ %261 = insertelement <1 x i32> poison, i32 %260, i64 0, !dbg !28
270
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %238, <1 x i32> %261, i1 %211) #6, !dbg !28
271
+ %262 = getelementptr i8, ptr addrspace(3) %188, i32 3072, !dbg !28
272
+ %263 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %262, i1 true) #6, !dbg !28
273
+ %264 = bitcast i32 %263 to float, !dbg !28
274
+ %265 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %263, i32 4, i32 31), !dbg !28
275
+ %266 = bitcast i32 %265 to float, !dbg !28
276
+ %267 = fcmp ogt float %264, %266, !dbg !21
277
+ %268 = fcmp uno float %264, 0.000000e+00, !dbg !25
278
+ %269 = or i1 %268, %267, !dbg !26
279
+ %270 = select i1 %269, float %264, float %266, !dbg !27
280
+ %271 = bitcast float %270 to i32, !dbg !28
281
+ %272 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %271, i32 2, i32 31), !dbg !28
282
+ %273 = bitcast i32 %272 to float, !dbg !28
283
+ %274 = fcmp ogt float %270, %273, !dbg !21
284
+ %275 = fcmp uno float %270, 0.000000e+00, !dbg !25
285
+ %276 = or i1 %274, %275, !dbg !26
286
+ %277 = select i1 %276, float %270, float %273, !dbg !27
287
+ %278 = bitcast float %277 to i32, !dbg !28
288
+ %279 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %278, i32 1, i32 31), !dbg !28
289
+ %280 = bitcast i32 %279 to float, !dbg !28
290
+ %281 = fcmp ogt float %277, %280, !dbg !21
291
+ %282 = fcmp uno float %277, 0.000000e+00, !dbg !25
292
+ %283 = or i1 %281, %282, !dbg !26
293
+ %284 = select i1 %283, i32 %278, i32 %279, !dbg !27
294
+ %285 = insertelement <1 x i32> poison, i32 %284, i64 0, !dbg !28
295
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %262, <1 x i32> %285, i1 %211) #6, !dbg !28
296
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28
297
+ %286 = load float, ptr addrspace(3) %176, align 16, !dbg !28
298
+ %287 = load float, ptr addrspace(3) %179, align 16, !dbg !28
299
+ %288 = load float, ptr addrspace(3) %182, align 16, !dbg !28
300
+ %289 = load float, ptr addrspace(3) %185, align 16, !dbg !28
301
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !29
302
+ %290 = shl nuw nsw i32 %14, 4, !dbg !29
303
+ %291 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %290, !dbg !29
304
+ %292 = insertelement <4 x float> poison, float %286, i64 0, !dbg !29
305
+ %293 = insertelement <4 x float> %292, float %287, i64 1, !dbg !29
306
+ %294 = insertelement <4 x float> %293, float %288, i64 2, !dbg !29
307
+ %295 = insertelement <4 x float> %294, float %289, i64 3, !dbg !29
308
+ store <4 x float> %295, ptr addrspace(3) %291, align 16, !dbg !29
309
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !29
310
+ %296 = shl nuw nsw i32 %13, 7, !dbg !29
311
+ %297 = shl nuw nsw i32 %12, 4, !dbg !29
312
+ %298 = or i32 %296, %297, !dbg !29
313
+ %299 = and i32 %298, 496, !dbg !29
314
+ %300 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %299, !dbg !29
315
+ %301 = ptrtoint ptr addrspace(3) %300 to i32, !dbg !29
316
+ %302 = tail call i32 asm sideeffect "ldmatrix.sync.aligned.m8n8.x1.shared.b16 {$0}, [$1];", "=r,r"(i32 %301) #6, !dbg !29
317
+ %303 = fcmp oeq float %286, 0xFFF0000000000000, !dbg !30
318
+ %304 = fcmp oeq float %287, 0xFFF0000000000000, !dbg !30
319
+ %305 = fcmp oeq float %288, 0xFFF0000000000000, !dbg !30
320
+ %306 = fcmp oeq float %289, 0xFFF0000000000000, !dbg !30
321
+ %307 = fsub float %41, %286, !dbg !31
322
+ %308 = fsub float %42, %287, !dbg !31
323
+ %309 = fsub float %43, %288, !dbg !31
324
+ %310 = fsub float %44, %289, !dbg !31
325
+ %311 = fsub float %50, %286, !dbg !31
326
+ %312 = fsub float %51, %287, !dbg !31
327
+ %313 = fsub float %52, %288, !dbg !31
328
+ %314 = fsub float %53, %289, !dbg !31
329
+ %315 = fsub float %59, %286, !dbg !31
330
+ %316 = fsub float %60, %287, !dbg !31
331
+ %317 = fsub float %61, %288, !dbg !31
332
+ %318 = fsub float %62, %289, !dbg !31
333
+ %319 = fsub float %68, %286, !dbg !31
334
+ %320 = fsub float %69, %287, !dbg !31
335
+ %321 = fsub float %70, %288, !dbg !31
336
+ %322 = fsub float %71, %289, !dbg !31
337
+ %323 = select i1 %303, float 0.000000e+00, float %307, !dbg !32
338
+ %324 = select i1 %304, float 0.000000e+00, float %308, !dbg !32
339
+ %325 = select i1 %305, float 0.000000e+00, float %309, !dbg !32
340
+ %326 = select i1 %306, float 0.000000e+00, float %310, !dbg !32
341
+ %327 = select i1 %303, float 0.000000e+00, float %311, !dbg !32
342
+ %328 = select i1 %304, float 0.000000e+00, float %312, !dbg !32
343
+ %329 = select i1 %305, float 0.000000e+00, float %313, !dbg !32
344
+ %330 = select i1 %306, float 0.000000e+00, float %314, !dbg !32
345
+ %331 = select i1 %303, float 0.000000e+00, float %315, !dbg !32
346
+ %332 = select i1 %304, float 0.000000e+00, float %316, !dbg !32
347
+ %333 = select i1 %305, float 0.000000e+00, float %317, !dbg !32
348
+ %334 = select i1 %306, float 0.000000e+00, float %318, !dbg !32
349
+ %335 = select i1 %303, float 0.000000e+00, float %319, !dbg !32
350
+ %336 = select i1 %304, float 0.000000e+00, float %320, !dbg !32
351
+ %337 = select i1 %305, float 0.000000e+00, float %321, !dbg !32
352
+ %338 = select i1 %306, float 0.000000e+00, float %322, !dbg !32
353
+ %339 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33
354
+ %.not.i = icmp eq i32 %339, 0, !dbg !33
355
+ br i1 %.not.i, label %342, label %340, !dbg !33
356
+
357
+ 340: ; preds = %9
358
+ %341 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %323) #6, !dbg !33
359
+ br label %__nv_exp2f.exit, !dbg !33
360
+
361
+ 342: ; preds = %9
362
+ %343 = tail call float @llvm.nvvm.ex2.approx.f(float %323) #6, !dbg !33
363
+ br label %__nv_exp2f.exit, !dbg !33
364
+
365
+ __nv_exp2f.exit: ; preds = %340, %342
366
+ %.0.i = phi float [ %341, %340 ], [ %343, %342 ], !dbg !33
367
+ %344 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33
368
+ %.not.i1 = icmp eq i32 %344, 0, !dbg !33
369
+ br i1 %.not.i1, label %347, label %345, !dbg !33
370
+
371
+ 345: ; preds = %__nv_exp2f.exit
372
+ %346 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %324) #6, !dbg !33
373
+ br label %__nv_exp2f.exit3, !dbg !33
374
+
375
+ 347: ; preds = %__nv_exp2f.exit
376
+ %348 = tail call float @llvm.nvvm.ex2.approx.f(float %324) #6, !dbg !33
377
+ br label %__nv_exp2f.exit3, !dbg !33
378
+
379
+ __nv_exp2f.exit3: ; preds = %345, %347
380
+ %.0.i2 = phi float [ %346, %345 ], [ %348, %347 ], !dbg !33
381
+ %349 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33
382
+ %.not.i4 = icmp eq i32 %349, 0, !dbg !33
383
+ br i1 %.not.i4, label %352, label %350, !dbg !33
384
+
385
+ 350: ; preds = %__nv_exp2f.exit3
386
+ %351 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %325) #6, !dbg !33
387
+ br label %__nv_exp2f.exit6, !dbg !33
388
+
389
+ 352: ; preds = %__nv_exp2f.exit3
390
+ %353 = tail call float @llvm.nvvm.ex2.approx.f(float %325) #6, !dbg !33
391
+ br label %__nv_exp2f.exit6, !dbg !33
392
+
393
+ __nv_exp2f.exit6: ; preds = %350, %352
394
+ %.0.i5 = phi float [ %351, %350 ], [ %353, %352 ], !dbg !33
395
+ %354 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33
396
+ %.not.i7 = icmp eq i32 %354, 0, !dbg !33
397
+ br i1 %.not.i7, label %357, label %355, !dbg !33
398
+
399
+ 355: ; preds = %__nv_exp2f.exit6
400
+ %356 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %326) #6, !dbg !33
401
+ br label %__nv_exp2f.exit9, !dbg !33
402
+
403
+ 357: ; preds = %__nv_exp2f.exit6
404
+ %358 = tail call float @llvm.nvvm.ex2.approx.f(float %326) #6, !dbg !33
405
+ br label %__nv_exp2f.exit9, !dbg !33
406
+
407
+ __nv_exp2f.exit9: ; preds = %355, %357
408
+ %.0.i8 = phi float [ %356, %355 ], [ %358, %357 ], !dbg !33
409
+ %359 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33
410
+ %.not.i10 = icmp eq i32 %359, 0, !dbg !33
411
+ br i1 %.not.i10, label %362, label %360, !dbg !33
412
+
413
+ 360: ; preds = %__nv_exp2f.exit9
414
+ %361 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %327) #6, !dbg !33
415
+ br label %__nv_exp2f.exit12, !dbg !33
416
+
417
+ 362: ; preds = %__nv_exp2f.exit9
418
+ %363 = tail call float @llvm.nvvm.ex2.approx.f(float %327) #6, !dbg !33
419
+ br label %__nv_exp2f.exit12, !dbg !33
420
+
421
+ __nv_exp2f.exit12: ; preds = %360, %362
422
+ %.0.i11 = phi float [ %361, %360 ], [ %363, %362 ], !dbg !33
423
+ %364 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33
424
+ %.not.i13 = icmp eq i32 %364, 0, !dbg !33
425
+ br i1 %.not.i13, label %367, label %365, !dbg !33
426
+
427
+ 365: ; preds = %__nv_exp2f.exit12
428
+ %366 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %328) #6, !dbg !33
429
+ br label %__nv_exp2f.exit15, !dbg !33
430
+
431
+ 367: ; preds = %__nv_exp2f.exit12
432
+ %368 = tail call float @llvm.nvvm.ex2.approx.f(float %328) #6, !dbg !33
433
+ br label %__nv_exp2f.exit15, !dbg !33
434
+
435
+ __nv_exp2f.exit15: ; preds = %365, %367
436
+ %.0.i14 = phi float [ %366, %365 ], [ %368, %367 ], !dbg !33
437
+ %369 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33
438
+ %.not.i16 = icmp eq i32 %369, 0, !dbg !33
439
+ br i1 %.not.i16, label %372, label %370, !dbg !33
440
+
441
+ 370: ; preds = %__nv_exp2f.exit15
442
+ %371 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %329) #6, !dbg !33
443
+ br label %__nv_exp2f.exit18, !dbg !33
444
+
445
+ 372: ; preds = %__nv_exp2f.exit15
446
+ %373 = tail call float @llvm.nvvm.ex2.approx.f(float %329) #6, !dbg !33
447
+ br label %__nv_exp2f.exit18, !dbg !33
448
+
449
+ __nv_exp2f.exit18: ; preds = %370, %372
450
+ %.0.i17 = phi float [ %371, %370 ], [ %373, %372 ], !dbg !33
451
+ %374 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33
452
+ %.not.i19 = icmp eq i32 %374, 0, !dbg !33
453
+ br i1 %.not.i19, label %377, label %375, !dbg !33
454
+
455
+ 375: ; preds = %__nv_exp2f.exit18
456
+ %376 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %330) #6, !dbg !33
457
+ br label %__nv_exp2f.exit21, !dbg !33
458
+
459
+ 377: ; preds = %__nv_exp2f.exit18
460
+ %378 = tail call float @llvm.nvvm.ex2.approx.f(float %330) #6, !dbg !33
461
+ br label %__nv_exp2f.exit21, !dbg !33
462
+
463
+ __nv_exp2f.exit21: ; preds = %375, %377
464
+ %.0.i20 = phi float [ %376, %375 ], [ %378, %377 ], !dbg !33
465
+ %379 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33
466
+ %.not.i22 = icmp eq i32 %379, 0, !dbg !33
467
+ br i1 %.not.i22, label %382, label %380, !dbg !33
468
+
469
+ 380: ; preds = %__nv_exp2f.exit21
470
+ %381 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %331) #6, !dbg !33
471
+ br label %__nv_exp2f.exit24, !dbg !33
472
+
473
+ 382: ; preds = %__nv_exp2f.exit21
474
+ %383 = tail call float @llvm.nvvm.ex2.approx.f(float %331) #6, !dbg !33
475
+ br label %__nv_exp2f.exit24, !dbg !33
476
+
477
+ __nv_exp2f.exit24: ; preds = %380, %382
478
+ %.0.i23 = phi float [ %381, %380 ], [ %383, %382 ], !dbg !33
479
+ %384 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33
480
+ %.not.i25 = icmp eq i32 %384, 0, !dbg !33
481
+ br i1 %.not.i25, label %387, label %385, !dbg !33
482
+
483
+ 385: ; preds = %__nv_exp2f.exit24
484
+ %386 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %332) #6, !dbg !33
485
+ br label %__nv_exp2f.exit27, !dbg !33
486
+
487
+ 387: ; preds = %__nv_exp2f.exit24
488
+ %388 = tail call float @llvm.nvvm.ex2.approx.f(float %332) #6, !dbg !33
489
+ br label %__nv_exp2f.exit27, !dbg !33
490
+
491
+ __nv_exp2f.exit27: ; preds = %385, %387
492
+ %.0.i26 = phi float [ %386, %385 ], [ %388, %387 ], !dbg !33
493
+ %389 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33
494
+ %.not.i28 = icmp eq i32 %389, 0, !dbg !33
495
+ br i1 %.not.i28, label %392, label %390, !dbg !33
496
+
497
+ 390: ; preds = %__nv_exp2f.exit27
498
+ %391 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %333) #6, !dbg !33
499
+ br label %__nv_exp2f.exit30, !dbg !33
500
+
501
+ 392: ; preds = %__nv_exp2f.exit27
502
+ %393 = tail call float @llvm.nvvm.ex2.approx.f(float %333) #6, !dbg !33
503
+ br label %__nv_exp2f.exit30, !dbg !33
504
+
505
+ __nv_exp2f.exit30: ; preds = %390, %392
506
+ %.0.i29 = phi float [ %391, %390 ], [ %393, %392 ], !dbg !33
507
+ %394 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33
508
+ %.not.i31 = icmp eq i32 %394, 0, !dbg !33
509
+ br i1 %.not.i31, label %397, label %395, !dbg !33
510
+
511
+ 395: ; preds = %__nv_exp2f.exit30
512
+ %396 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %334) #6, !dbg !33
513
+ br label %__nv_exp2f.exit33, !dbg !33
514
+
515
+ 397: ; preds = %__nv_exp2f.exit30
516
+ %398 = tail call float @llvm.nvvm.ex2.approx.f(float %334) #6, !dbg !33
517
+ br label %__nv_exp2f.exit33, !dbg !33
518
+
519
+ __nv_exp2f.exit33: ; preds = %395, %397
520
+ %.0.i32 = phi float [ %396, %395 ], [ %398, %397 ], !dbg !33
521
+ %399 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33
522
+ %.not.i34 = icmp eq i32 %399, 0, !dbg !33
523
+ br i1 %.not.i34, label %402, label %400, !dbg !33
524
+
525
+ 400: ; preds = %__nv_exp2f.exit33
526
+ %401 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %335) #6, !dbg !33
527
+ br label %__nv_exp2f.exit36, !dbg !33
528
+
529
+ 402: ; preds = %__nv_exp2f.exit33
530
+ %403 = tail call float @llvm.nvvm.ex2.approx.f(float %335) #6, !dbg !33
531
+ br label %__nv_exp2f.exit36, !dbg !33
532
+
533
+ __nv_exp2f.exit36: ; preds = %400, %402
534
+ %.0.i35 = phi float [ %401, %400 ], [ %403, %402 ], !dbg !33
535
+ %404 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33
536
+ %.not.i37 = icmp eq i32 %404, 0, !dbg !33
537
+ br i1 %.not.i37, label %407, label %405, !dbg !33
538
+
539
+ 405: ; preds = %__nv_exp2f.exit36
540
+ %406 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %336) #6, !dbg !33
541
+ br label %__nv_exp2f.exit39, !dbg !33
542
+
543
+ 407: ; preds = %__nv_exp2f.exit36
544
+ %408 = tail call float @llvm.nvvm.ex2.approx.f(float %336) #6, !dbg !33
545
+ br label %__nv_exp2f.exit39, !dbg !33
546
+
547
+ __nv_exp2f.exit39: ; preds = %405, %407
548
+ %.0.i38 = phi float [ %406, %405 ], [ %408, %407 ], !dbg !33
549
+ %409 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33
550
+ %.not.i40 = icmp eq i32 %409, 0, !dbg !33
551
+ br i1 %.not.i40, label %412, label %410, !dbg !33
552
+
553
+ 410: ; preds = %__nv_exp2f.exit39
554
+ %411 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %337) #6, !dbg !33
555
+ br label %__nv_exp2f.exit42, !dbg !33
556
+
557
+ 412: ; preds = %__nv_exp2f.exit39
558
+ %413 = tail call float @llvm.nvvm.ex2.approx.f(float %337) #6, !dbg !33
559
+ br label %__nv_exp2f.exit42, !dbg !33
560
+
561
+ __nv_exp2f.exit42: ; preds = %410, %412
562
+ %.0.i41 = phi float [ %411, %410 ], [ %413, %412 ], !dbg !33
563
+ %414 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33
564
+ %.not.i43 = icmp eq i32 %414, 0, !dbg !33
565
+ br i1 %.not.i43, label %417, label %415, !dbg !33
566
+
567
+ 415: ; preds = %__nv_exp2f.exit42
568
+ %416 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %338) #6, !dbg !33
569
+ br label %__nv_exp2f.exit45, !dbg !33
570
+
571
+ 417: ; preds = %__nv_exp2f.exit42
572
+ %418 = tail call float @llvm.nvvm.ex2.approx.f(float %338) #6, !dbg !33
573
+ br label %__nv_exp2f.exit45, !dbg !33
574
+
575
+ __nv_exp2f.exit45: ; preds = %415, %417
576
+ %.0.i44 = phi float [ %416, %415 ], [ %418, %417 ], !dbg !33
577
+ %419 = fmul float %.0.i, %81, !dbg !34
578
+ %420 = fmul float %.0.i2, %82, !dbg !34
579
+ %421 = fmul float %.0.i5, %83, !dbg !34
580
+ %422 = fmul float %.0.i8, %84, !dbg !34
581
+ %423 = fmul float %.0.i11, %90, !dbg !34
582
+ %424 = fmul float %.0.i14, %91, !dbg !34
583
+ %425 = fmul float %.0.i17, %92, !dbg !34
584
+ %426 = fmul float %.0.i20, %93, !dbg !34
585
+ %427 = fmul float %.0.i23, %99, !dbg !34
586
+ %428 = fmul float %.0.i26, %100, !dbg !34
587
+ %429 = fmul float %.0.i29, %101, !dbg !34
588
+ %430 = fmul float %.0.i32, %102, !dbg !34
589
+ %431 = fmul float %.0.i35, %108, !dbg !34
590
+ %432 = fmul float %.0.i38, %109, !dbg !34
591
+ %433 = fmul float %.0.i41, %110, !dbg !34
592
+ %434 = fmul float %.0.i44, %111, !dbg !34
593
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !35
594
+ %435 = fadd float %419, %423, !dbg !39
595
+ %436 = fadd float %420, %424, !dbg !39
596
+ %437 = fadd float %421, %425, !dbg !39
597
+ %438 = fadd float %422, %426, !dbg !39
598
+ %439 = fadd float %435, %427, !dbg !39
599
+ %440 = fadd float %436, %428, !dbg !39
600
+ %441 = fadd float %437, %429, !dbg !39
601
+ %442 = fadd float %438, %430, !dbg !39
602
+ %443 = fadd float %439, %431, !dbg !39
603
+ %444 = fadd float %440, %432, !dbg !39
604
+ %445 = fadd float %441, %433, !dbg !39
605
+ %446 = fadd float %442, %434, !dbg !39
606
+ %447 = bitcast float %443 to i32, !dbg !35
607
+ %448 = select i1 %17, i32 %447, i32 0, !dbg !39
608
+ %449 = insertelement <1 x i32> poison, i32 %448, i64 0, !dbg !35
609
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %177, <1 x i32> %449, i1 true) #6, !dbg !35
610
+ %450 = bitcast float %444 to i32, !dbg !35
611
+ %451 = select i1 %17, i32 %450, i32 0, !dbg !39
612
+ %452 = insertelement <1 x i32> poison, i32 %451, i64 0, !dbg !35
613
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %180, <1 x i32> %452, i1 true) #6, !dbg !35
614
+ %453 = bitcast float %445 to i32, !dbg !35
615
+ %454 = select i1 %17, i32 %453, i32 0, !dbg !39
616
+ %455 = insertelement <1 x i32> poison, i32 %454, i64 0, !dbg !35
617
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %183, <1 x i32> %455, i1 true) #6, !dbg !35
618
+ %456 = bitcast float %446 to i32, !dbg !35
619
+ %457 = select i1 %17, i32 %456, i32 0, !dbg !39
620
+ %458 = insertelement <1 x i32> poison, i32 %457, i64 0, !dbg !35
621
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %186, <1 x i32> %458, i1 true) #6, !dbg !35
622
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !35
623
+ %459 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %188, i1 true) #6, !dbg !35
624
+ %460 = bitcast i32 %459 to float, !dbg !35
625
+ %461 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %459, i32 4, i32 31), !dbg !35
626
+ %462 = bitcast i32 %461 to float, !dbg !35
627
+ %463 = fadd float %460, %462, !dbg !39
628
+ %464 = bitcast float %463 to i32, !dbg !35
629
+ %465 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %464, i32 2, i32 31), !dbg !35
630
+ %466 = bitcast i32 %465 to float, !dbg !35
631
+ %467 = fadd float %463, %466, !dbg !39
632
+ %468 = bitcast float %467 to i32, !dbg !35
633
+ %469 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %468, i32 1, i32 31), !dbg !35
634
+ %470 = bitcast i32 %469 to float, !dbg !35
635
+ %471 = fadd float %467, %470, !dbg !39
636
+ %472 = bitcast float %471 to <1 x i32>, !dbg !35
637
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %472, i1 %211) #6, !dbg !35
638
+ %473 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %214, i1 true) #6, !dbg !35
639
+ %474 = bitcast i32 %473 to float, !dbg !35
640
+ %475 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %473, i32 4, i32 31), !dbg !35
641
+ %476 = bitcast i32 %475 to float, !dbg !35
642
+ %477 = fadd float %474, %476, !dbg !39
643
+ %478 = bitcast float %477 to i32, !dbg !35
644
+ %479 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %478, i32 2, i32 31), !dbg !35
645
+ %480 = bitcast i32 %479 to float, !dbg !35
646
+ %481 = fadd float %477, %480, !dbg !39
647
+ %482 = bitcast float %481 to i32, !dbg !35
648
+ %483 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %482, i32 1, i32 31), !dbg !35
649
+ %484 = bitcast i32 %483 to float, !dbg !35
650
+ %485 = fadd float %481, %484, !dbg !39
651
+ %486 = bitcast float %485 to <1 x i32>, !dbg !35
652
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %214, <1 x i32> %486, i1 %211) #6, !dbg !35
653
+ %487 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %238, i1 true) #6, !dbg !35
654
+ %488 = bitcast i32 %487 to float, !dbg !35
655
+ %489 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %487, i32 4, i32 31), !dbg !35
656
+ %490 = bitcast i32 %489 to float, !dbg !35
657
+ %491 = fadd float %488, %490, !dbg !39
658
+ %492 = bitcast float %491 to i32, !dbg !35
659
+ %493 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %492, i32 2, i32 31), !dbg !35
660
+ %494 = bitcast i32 %493 to float, !dbg !35
661
+ %495 = fadd float %491, %494, !dbg !39
662
+ %496 = bitcast float %495 to i32, !dbg !35
663
+ %497 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %496, i32 1, i32 31), !dbg !35
664
+ %498 = bitcast i32 %497 to float, !dbg !35
665
+ %499 = fadd float %495, %498, !dbg !39
666
+ %500 = bitcast float %499 to <1 x i32>, !dbg !35
667
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %238, <1 x i32> %500, i1 %211) #6, !dbg !35
668
+ %501 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %262, i1 true) #6, !dbg !35
669
+ %502 = bitcast i32 %501 to float, !dbg !35
670
+ %503 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %501, i32 4, i32 31), !dbg !35
671
+ %504 = bitcast i32 %503 to float, !dbg !35
672
+ %505 = fadd float %502, %504, !dbg !39
673
+ %506 = bitcast float %505 to i32, !dbg !35
674
+ %507 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %506, i32 2, i32 31), !dbg !35
675
+ %508 = bitcast i32 %507 to float, !dbg !35
676
+ %509 = fadd float %505, %508, !dbg !39
677
+ %510 = bitcast float %509 to i32, !dbg !35
678
+ %511 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %510, i32 1, i32 31), !dbg !35
679
+ %512 = bitcast i32 %511 to float, !dbg !35
680
+ %513 = fadd float %509, %512, !dbg !39
681
+ %514 = bitcast float %513 to <1 x i32>, !dbg !35
682
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %262, <1 x i32> %514, i1 %211) #6, !dbg !35
683
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !35
684
+ %515 = load float, ptr addrspace(3) %176, align 16, !dbg !35
685
+ %516 = load float, ptr addrspace(3) %179, align 16, !dbg !35
686
+ %517 = load float, ptr addrspace(3) %182, align 16, !dbg !35
687
+ %518 = load float, ptr addrspace(3) %185, align 16, !dbg !35
688
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !40
689
+ %519 = insertelement <4 x float> poison, float %515, i64 0, !dbg !40
690
+ %520 = insertelement <4 x float> %519, float %516, i64 1, !dbg !40
691
+ %521 = insertelement <4 x float> %520, float %517, i64 2, !dbg !40
692
+ %522 = insertelement <4 x float> %521, float %518, i64 3, !dbg !40
693
+ store <4 x float> %522, ptr addrspace(3) %291, align 16, !dbg !40
694
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !40
695
+ %523 = tail call i32 asm sideeffect "ldmatrix.sync.aligned.m8n8.x1.shared.b16 {$0}, [$1];", "=r,r"(i32 %301) #6, !dbg !40
696
+ %524 = select i1 %303, float 1.000000e+00, float %515, !dbg !41
697
+ %525 = select i1 %304, float 1.000000e+00, float %516, !dbg !41
698
+ %526 = select i1 %305, float 1.000000e+00, float %517, !dbg !41
699
+ %527 = select i1 %306, float 1.000000e+00, float %518, !dbg !41
700
+ %528 = fcmp olt float %524, 0x3810000000000000, !dbg !42
701
+ %529 = fmul float %524, 0x4160000000000000, !dbg !42
702
+ %.02.i = select i1 %528, float %529, float %524, !dbg !42
703
+ %i.i.0.i = select i1 %528, float -2.300000e+01, float 0.000000e+00, !dbg !42
704
+ %530 = bitcast float %.02.i to i32, !dbg !42
705
+ %531 = add i32 %530, -1060439283, !dbg !42
706
+ %532 = and i32 %531, -8388608, !dbg !42
707
+ %533 = sub i32 %530, %532, !dbg !42
708
+ %534 = bitcast i32 %533 to float, !dbg !42
709
+ %535 = sitofp i32 %532 to float, !dbg !42
710
+ %536 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
711
+ %.not.i46 = icmp eq i32 %536, 0, !dbg !42
712
+ %537 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %535, float 0x3E80000000000000, float %i.i.0.i) #6, !dbg !42
713
+ %538 = tail call float @llvm.nvvm.fma.rn.f(float %535, float 0x3E80000000000000, float %i.i.0.i) #6, !dbg !42
714
+ %.08.i = select i1 %.not.i46, float %538, float %537, !dbg !42
715
+ %539 = fadd float %534, -1.000000e+00, !dbg !42
716
+ %540 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
717
+ %.not1.i = icmp eq i32 %540, 0, !dbg !42
718
+ %541 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0x3FB8D64FE0000000, float %539, float 0xBFC58FE600000000) #6, !dbg !42
719
+ %542 = tail call float @llvm.nvvm.fma.rn.f(float 0x3FB8D64FE0000000, float %539, float 0xBFC58FE600000000) #6, !dbg !42
720
+ %.010.i = select i1 %.not1.i, float %542, float %541, !dbg !42
721
+ %543 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
722
+ %.not2.i = icmp eq i32 %543, 0, !dbg !42
723
+ %544 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i, float %539, float 0x3FC5F9E540000000) #6, !dbg !42
724
+ %545 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i, float %539, float 0x3FC5F9E540000000) #6, !dbg !42
725
+ %.011.i = select i1 %.not2.i, float %545, float %544, !dbg !42
726
+ %546 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
727
+ %.not3.i = icmp eq i32 %546, 0, !dbg !42
728
+ %547 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i, float %539, float 0xBFC6E9C860000000) #6, !dbg !42
729
+ %548 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i, float %539, float 0xBFC6E9C860000000) #6, !dbg !42
730
+ %.012.i = select i1 %.not3.i, float %548, float %547, !dbg !42
731
+ %549 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
732
+ %.not4.i = icmp eq i32 %549, 0, !dbg !42
733
+ %550 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i, float %539, float 0x3FCA417E80000000) #6, !dbg !42
734
+ %551 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i, float %539, float 0x3FCA417E80000000) #6, !dbg !42
735
+ %.09.i = select i1 %.not4.i, float %551, float %550, !dbg !42
736
+ %552 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
737
+ %.not5.i = icmp eq i32 %552, 0, !dbg !42
738
+ %553 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i, float %539, float 0xBFCEC79160000000) #6, !dbg !42
739
+ %554 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i, float %539, float 0xBFCEC79160000000) #6, !dbg !42
740
+ %.05.i = select i1 %.not5.i, float %554, float %553, !dbg !42
741
+ %555 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
742
+ %.not6.i = icmp eq i32 %555, 0, !dbg !42
743
+ %556 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %539, float 0x3FD277F320000000) #6, !dbg !42
744
+ %557 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %539, float 0x3FD277F320000000) #6, !dbg !42
745
+ %.01.i = select i1 %.not6.i, float %557, float %556, !dbg !42
746
+ %558 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
747
+ %.not7.i = icmp eq i32 %558, 0, !dbg !42
748
+ %559 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i, float %539, float 0xBFD7154920000000) #6, !dbg !42
749
+ %560 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i, float %539, float 0xBFD7154920000000) #6, !dbg !42
750
+ %.0.i47 = select i1 %.not7.i, float %560, float %559, !dbg !42
751
+ %561 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
752
+ %.not8.i = icmp eq i32 %561, 0, !dbg !42
753
+ %562 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i47, float %539, float 0x3FDEC70940000000) #6, !dbg !42
754
+ %563 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i47, float %539, float 0x3FDEC70940000000) #6, !dbg !42
755
+ %.07.i = select i1 %.not8.i, float %563, float %562, !dbg !42
756
+ %564 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
757
+ %.not9.i = icmp eq i32 %564, 0, !dbg !42
758
+ %565 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %539, float 0xBFE7154760000000) #6, !dbg !42
759
+ %566 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %539, float 0xBFE7154760000000) #6, !dbg !42
760
+ %.06.i = select i1 %.not9.i, float %566, float %565, !dbg !42
761
+ %567 = fmul float %539, %.06.i, !dbg !42
762
+ %568 = fmul float %539, %567, !dbg !42
763
+ %569 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
764
+ %.not10.i = icmp eq i32 %569, 0, !dbg !42
765
+ %570 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %539, float 0x3FF7154760000000, float %568) #6, !dbg !42
766
+ %571 = tail call float @llvm.nvvm.fma.rn.f(float %539, float 0x3FF7154760000000, float %568) #6, !dbg !42
767
+ %.04.i = select i1 %.not10.i, float %571, float %570, !dbg !42
768
+ %572 = fadd float %.08.i, %.04.i, !dbg !42
769
+ %573 = icmp ugt i32 %530, 2139095039, !dbg !42
770
+ br i1 %573, label %__nv_fmaf_rn.exit.i.i, label %__nv_log2f.exit, !dbg !42
771
+
772
+ __nv_fmaf_rn.exit.i.i: ; preds = %__nv_exp2f.exit45
773
+ %574 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
774
+ %.not11.i = icmp eq i32 %574, 0, !dbg !42
775
+ %575 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !42
776
+ %576 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !42
777
+ %.03.i = select i1 %.not11.i, float %576, float %575, !dbg !42
778
+ br label %__nv_log2f.exit, !dbg !42
779
+
780
+ __nv_log2f.exit: ; preds = %__nv_exp2f.exit45, %__nv_fmaf_rn.exit.i.i
781
+ %r.i.0.i = phi float [ %.03.i, %__nv_fmaf_rn.exit.i.i ], [ %572, %__nv_exp2f.exit45 ], !dbg !42
782
+ %577 = fcmp olt float %525, 0x3810000000000000, !dbg !42
783
+ %578 = fmul float %525, 0x4160000000000000, !dbg !42
784
+ %.02.i48 = select i1 %577, float %578, float %525, !dbg !42
785
+ %i.i.0.i49 = select i1 %577, float -2.300000e+01, float 0.000000e+00, !dbg !42
786
+ %579 = bitcast float %.02.i48 to i32, !dbg !42
787
+ %580 = add i32 %579, -1060439283, !dbg !42
788
+ %581 = and i32 %580, -8388608, !dbg !42
789
+ %582 = sub i32 %579, %581, !dbg !42
790
+ %583 = bitcast i32 %582 to float, !dbg !42
791
+ %584 = sitofp i32 %581 to float, !dbg !42
792
+ %585 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
793
+ %.not.i50 = icmp eq i32 %585, 0, !dbg !42
794
+ %586 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %584, float 0x3E80000000000000, float %i.i.0.i49) #6, !dbg !42
795
+ %587 = tail call float @llvm.nvvm.fma.rn.f(float %584, float 0x3E80000000000000, float %i.i.0.i49) #6, !dbg !42
796
+ %.08.i51 = select i1 %.not.i50, float %587, float %586, !dbg !42
797
+ %588 = fadd float %583, -1.000000e+00, !dbg !42
798
+ %589 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
799
+ %.not1.i52 = icmp eq i32 %589, 0, !dbg !42
800
+ %590 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0x3FB8D64FE0000000, float %588, float 0xBFC58FE600000000) #6, !dbg !42
801
+ %591 = tail call float @llvm.nvvm.fma.rn.f(float 0x3FB8D64FE0000000, float %588, float 0xBFC58FE600000000) #6, !dbg !42
802
+ %.010.i53 = select i1 %.not1.i52, float %591, float %590, !dbg !42
803
+ %592 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
804
+ %.not2.i54 = icmp eq i32 %592, 0, !dbg !42
805
+ %593 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i53, float %588, float 0x3FC5F9E540000000) #6, !dbg !42
806
+ %594 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i53, float %588, float 0x3FC5F9E540000000) #6, !dbg !42
807
+ %.011.i55 = select i1 %.not2.i54, float %594, float %593, !dbg !42
808
+ %595 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
809
+ %.not3.i56 = icmp eq i32 %595, 0, !dbg !42
810
+ %596 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i55, float %588, float 0xBFC6E9C860000000) #6, !dbg !42
811
+ %597 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i55, float %588, float 0xBFC6E9C860000000) #6, !dbg !42
812
+ %.012.i57 = select i1 %.not3.i56, float %597, float %596, !dbg !42
813
+ %598 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
814
+ %.not4.i58 = icmp eq i32 %598, 0, !dbg !42
815
+ %599 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i57, float %588, float 0x3FCA417E80000000) #6, !dbg !42
816
+ %600 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i57, float %588, float 0x3FCA417E80000000) #6, !dbg !42
817
+ %.09.i59 = select i1 %.not4.i58, float %600, float %599, !dbg !42
818
+ %601 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
819
+ %.not5.i60 = icmp eq i32 %601, 0, !dbg !42
820
+ %602 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i59, float %588, float 0xBFCEC79160000000) #6, !dbg !42
821
+ %603 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i59, float %588, float 0xBFCEC79160000000) #6, !dbg !42
822
+ %.05.i61 = select i1 %.not5.i60, float %603, float %602, !dbg !42
823
+ %604 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
824
+ %.not6.i62 = icmp eq i32 %604, 0, !dbg !42
825
+ %605 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i61, float %588, float 0x3FD277F320000000) #6, !dbg !42
826
+ %606 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i61, float %588, float 0x3FD277F320000000) #6, !dbg !42
827
+ %.01.i63 = select i1 %.not6.i62, float %606, float %605, !dbg !42
828
+ %607 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
829
+ %.not7.i64 = icmp eq i32 %607, 0, !dbg !42
830
+ %608 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i63, float %588, float 0xBFD7154920000000) #6, !dbg !42
831
+ %609 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i63, float %588, float 0xBFD7154920000000) #6, !dbg !42
832
+ %.0.i65 = select i1 %.not7.i64, float %609, float %608, !dbg !42
833
+ %610 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
834
+ %.not8.i66 = icmp eq i32 %610, 0, !dbg !42
835
+ %611 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i65, float %588, float 0x3FDEC70940000000) #6, !dbg !42
836
+ %612 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i65, float %588, float 0x3FDEC70940000000) #6, !dbg !42
837
+ %.07.i67 = select i1 %.not8.i66, float %612, float %611, !dbg !42
838
+ %613 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
839
+ %.not9.i68 = icmp eq i32 %613, 0, !dbg !42
840
+ %614 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i67, float %588, float 0xBFE7154760000000) #6, !dbg !42
841
+ %615 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i67, float %588, float 0xBFE7154760000000) #6, !dbg !42
842
+ %.06.i69 = select i1 %.not9.i68, float %615, float %614, !dbg !42
843
+ %616 = fmul float %588, %.06.i69, !dbg !42
844
+ %617 = fmul float %588, %616, !dbg !42
845
+ %618 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
846
+ %.not10.i70 = icmp eq i32 %618, 0, !dbg !42
847
+ %619 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %588, float 0x3FF7154760000000, float %617) #6, !dbg !42
848
+ %620 = tail call float @llvm.nvvm.fma.rn.f(float %588, float 0x3FF7154760000000, float %617) #6, !dbg !42
849
+ %.04.i71 = select i1 %.not10.i70, float %620, float %619, !dbg !42
850
+ %621 = fadd float %.08.i51, %.04.i71, !dbg !42
851
+ %622 = icmp ugt i32 %579, 2139095039, !dbg !42
852
+ br i1 %622, label %__nv_fmaf_rn.exit.i.i74, label %__nv_log2f.exit77, !dbg !42
853
+
854
+ __nv_fmaf_rn.exit.i.i74: ; preds = %__nv_log2f.exit
855
+ %623 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
856
+ %.not11.i75 = icmp eq i32 %623, 0, !dbg !42
857
+ %624 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i48, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !42
858
+ %625 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i48, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !42
859
+ %.03.i76 = select i1 %.not11.i75, float %625, float %624, !dbg !42
860
+ br label %__nv_log2f.exit77, !dbg !42
861
+
862
+ __nv_log2f.exit77: ; preds = %__nv_log2f.exit, %__nv_fmaf_rn.exit.i.i74
863
+ %r.i.0.i72 = phi float [ %.03.i76, %__nv_fmaf_rn.exit.i.i74 ], [ %621, %__nv_log2f.exit ], !dbg !42
864
+ %626 = fcmp olt float %526, 0x3810000000000000, !dbg !42
865
+ %627 = fmul float %526, 0x4160000000000000, !dbg !42
866
+ %.02.i78 = select i1 %626, float %627, float %526, !dbg !42
867
+ %i.i.0.i79 = select i1 %626, float -2.300000e+01, float 0.000000e+00, !dbg !42
868
+ %628 = bitcast float %.02.i78 to i32, !dbg !42
869
+ %629 = add i32 %628, -1060439283, !dbg !42
870
+ %630 = and i32 %629, -8388608, !dbg !42
871
+ %631 = sub i32 %628, %630, !dbg !42
872
+ %632 = bitcast i32 %631 to float, !dbg !42
873
+ %633 = sitofp i32 %630 to float, !dbg !42
874
+ %634 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
875
+ %.not.i80 = icmp eq i32 %634, 0, !dbg !42
876
+ %635 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %633, float 0x3E80000000000000, float %i.i.0.i79) #6, !dbg !42
877
+ %636 = tail call float @llvm.nvvm.fma.rn.f(float %633, float 0x3E80000000000000, float %i.i.0.i79) #6, !dbg !42
878
+ %.08.i81 = select i1 %.not.i80, float %636, float %635, !dbg !42
879
+ %637 = fadd float %632, -1.000000e+00, !dbg !42
880
+ %638 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
881
+ %.not1.i82 = icmp eq i32 %638, 0, !dbg !42
882
+ %639 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0x3FB8D64FE0000000, float %637, float 0xBFC58FE600000000) #6, !dbg !42
883
+ %640 = tail call float @llvm.nvvm.fma.rn.f(float 0x3FB8D64FE0000000, float %637, float 0xBFC58FE600000000) #6, !dbg !42
884
+ %.010.i83 = select i1 %.not1.i82, float %640, float %639, !dbg !42
885
+ %641 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
886
+ %.not2.i84 = icmp eq i32 %641, 0, !dbg !42
887
+ %642 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i83, float %637, float 0x3FC5F9E540000000) #6, !dbg !42
888
+ %643 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i83, float %637, float 0x3FC5F9E540000000) #6, !dbg !42
889
+ %.011.i85 = select i1 %.not2.i84, float %643, float %642, !dbg !42
890
+ %644 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
891
+ %.not3.i86 = icmp eq i32 %644, 0, !dbg !42
892
+ %645 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i85, float %637, float 0xBFC6E9C860000000) #6, !dbg !42
893
+ %646 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i85, float %637, float 0xBFC6E9C860000000) #6, !dbg !42
894
+ %.012.i87 = select i1 %.not3.i86, float %646, float %645, !dbg !42
895
+ %647 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
896
+ %.not4.i88 = icmp eq i32 %647, 0, !dbg !42
897
+ %648 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i87, float %637, float 0x3FCA417E80000000) #6, !dbg !42
898
+ %649 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i87, float %637, float 0x3FCA417E80000000) #6, !dbg !42
899
+ %.09.i89 = select i1 %.not4.i88, float %649, float %648, !dbg !42
900
+ %650 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
901
+ %.not5.i90 = icmp eq i32 %650, 0, !dbg !42
902
+ %651 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i89, float %637, float 0xBFCEC79160000000) #6, !dbg !42
903
+ %652 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i89, float %637, float 0xBFCEC79160000000) #6, !dbg !42
904
+ %.05.i91 = select i1 %.not5.i90, float %652, float %651, !dbg !42
905
+ %653 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
906
+ %.not6.i92 = icmp eq i32 %653, 0, !dbg !42
907
+ %654 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i91, float %637, float 0x3FD277F320000000) #6, !dbg !42
908
+ %655 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i91, float %637, float 0x3FD277F320000000) #6, !dbg !42
909
+ %.01.i93 = select i1 %.not6.i92, float %655, float %654, !dbg !42
910
+ %656 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
911
+ %.not7.i94 = icmp eq i32 %656, 0, !dbg !42
912
+ %657 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i93, float %637, float 0xBFD7154920000000) #6, !dbg !42
913
+ %658 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i93, float %637, float 0xBFD7154920000000) #6, !dbg !42
914
+ %.0.i95 = select i1 %.not7.i94, float %658, float %657, !dbg !42
915
+ %659 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
916
+ %.not8.i96 = icmp eq i32 %659, 0, !dbg !42
917
+ %660 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i95, float %637, float 0x3FDEC70940000000) #6, !dbg !42
918
+ %661 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i95, float %637, float 0x3FDEC70940000000) #6, !dbg !42
919
+ %.07.i97 = select i1 %.not8.i96, float %661, float %660, !dbg !42
920
+ %662 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
921
+ %.not9.i98 = icmp eq i32 %662, 0, !dbg !42
922
+ %663 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i97, float %637, float 0xBFE7154760000000) #6, !dbg !42
923
+ %664 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i97, float %637, float 0xBFE7154760000000) #6, !dbg !42
924
+ %.06.i99 = select i1 %.not9.i98, float %664, float %663, !dbg !42
925
+ %665 = fmul float %637, %.06.i99, !dbg !42
926
+ %666 = fmul float %637, %665, !dbg !42
927
+ %667 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
928
+ %.not10.i100 = icmp eq i32 %667, 0, !dbg !42
929
+ %668 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %637, float 0x3FF7154760000000, float %666) #6, !dbg !42
930
+ %669 = tail call float @llvm.nvvm.fma.rn.f(float %637, float 0x3FF7154760000000, float %666) #6, !dbg !42
931
+ %.04.i101 = select i1 %.not10.i100, float %669, float %668, !dbg !42
932
+ %670 = fadd float %.08.i81, %.04.i101, !dbg !42
933
+ %671 = icmp ugt i32 %628, 2139095039, !dbg !42
934
+ br i1 %671, label %__nv_fmaf_rn.exit.i.i104, label %__nv_log2f.exit107, !dbg !42
935
+
936
+ __nv_fmaf_rn.exit.i.i104: ; preds = %__nv_log2f.exit77
937
+ %672 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
938
+ %.not11.i105 = icmp eq i32 %672, 0, !dbg !42
939
+ %673 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i78, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !42
940
+ %674 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i78, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !42
941
+ %.03.i106 = select i1 %.not11.i105, float %674, float %673, !dbg !42
942
+ br label %__nv_log2f.exit107, !dbg !42
943
+
944
+ __nv_log2f.exit107: ; preds = %__nv_log2f.exit77, %__nv_fmaf_rn.exit.i.i104
945
+ %r.i.0.i102 = phi float [ %.03.i106, %__nv_fmaf_rn.exit.i.i104 ], [ %670, %__nv_log2f.exit77 ], !dbg !42
946
+ %675 = fcmp olt float %527, 0x3810000000000000, !dbg !42
947
+ %676 = fmul float %527, 0x4160000000000000, !dbg !42
948
+ %.02.i108 = select i1 %675, float %676, float %527, !dbg !42
949
+ %i.i.0.i109 = select i1 %675, float -2.300000e+01, float 0.000000e+00, !dbg !42
950
+ %677 = bitcast float %.02.i108 to i32, !dbg !42
951
+ %678 = add i32 %677, -1060439283, !dbg !42
952
+ %679 = and i32 %678, -8388608, !dbg !42
953
+ %680 = sub i32 %677, %679, !dbg !42
954
+ %681 = bitcast i32 %680 to float, !dbg !42
955
+ %682 = sitofp i32 %679 to float, !dbg !42
956
+ %683 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
957
+ %.not.i110 = icmp eq i32 %683, 0, !dbg !42
958
+ %684 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %682, float 0x3E80000000000000, float %i.i.0.i109) #6, !dbg !42
959
+ %685 = tail call float @llvm.nvvm.fma.rn.f(float %682, float 0x3E80000000000000, float %i.i.0.i109) #6, !dbg !42
960
+ %.08.i111 = select i1 %.not.i110, float %685, float %684, !dbg !42
961
+ %686 = fadd float %681, -1.000000e+00, !dbg !42
962
+ %687 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
963
+ %.not1.i112 = icmp eq i32 %687, 0, !dbg !42
964
+ %688 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0x3FB8D64FE0000000, float %686, float 0xBFC58FE600000000) #6, !dbg !42
965
+ %689 = tail call float @llvm.nvvm.fma.rn.f(float 0x3FB8D64FE0000000, float %686, float 0xBFC58FE600000000) #6, !dbg !42
966
+ %.010.i113 = select i1 %.not1.i112, float %689, float %688, !dbg !42
967
+ %690 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
968
+ %.not2.i114 = icmp eq i32 %690, 0, !dbg !42
969
+ %691 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i113, float %686, float 0x3FC5F9E540000000) #6, !dbg !42
970
+ %692 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i113, float %686, float 0x3FC5F9E540000000) #6, !dbg !42
971
+ %.011.i115 = select i1 %.not2.i114, float %692, float %691, !dbg !42
972
+ %693 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
973
+ %.not3.i116 = icmp eq i32 %693, 0, !dbg !42
974
+ %694 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i115, float %686, float 0xBFC6E9C860000000) #6, !dbg !42
975
+ %695 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i115, float %686, float 0xBFC6E9C860000000) #6, !dbg !42
976
+ %.012.i117 = select i1 %.not3.i116, float %695, float %694, !dbg !42
977
+ %696 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
978
+ %.not4.i118 = icmp eq i32 %696, 0, !dbg !42
979
+ %697 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i117, float %686, float 0x3FCA417E80000000) #6, !dbg !42
980
+ %698 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i117, float %686, float 0x3FCA417E80000000) #6, !dbg !42
981
+ %.09.i119 = select i1 %.not4.i118, float %698, float %697, !dbg !42
982
+ %699 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
983
+ %.not5.i120 = icmp eq i32 %699, 0, !dbg !42
984
+ %700 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i119, float %686, float 0xBFCEC79160000000) #6, !dbg !42
985
+ %701 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i119, float %686, float 0xBFCEC79160000000) #6, !dbg !42
986
+ %.05.i121 = select i1 %.not5.i120, float %701, float %700, !dbg !42
987
+ %702 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
988
+ %.not6.i122 = icmp eq i32 %702, 0, !dbg !42
989
+ %703 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i121, float %686, float 0x3FD277F320000000) #6, !dbg !42
990
+ %704 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i121, float %686, float 0x3FD277F320000000) #6, !dbg !42
991
+ %.01.i123 = select i1 %.not6.i122, float %704, float %703, !dbg !42
992
+ %705 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
993
+ %.not7.i124 = icmp eq i32 %705, 0, !dbg !42
994
+ %706 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i123, float %686, float 0xBFD7154920000000) #6, !dbg !42
995
+ %707 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i123, float %686, float 0xBFD7154920000000) #6, !dbg !42
996
+ %.0.i125 = select i1 %.not7.i124, float %707, float %706, !dbg !42
997
+ %708 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
998
+ %.not8.i126 = icmp eq i32 %708, 0, !dbg !42
999
+ %709 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i125, float %686, float 0x3FDEC70940000000) #6, !dbg !42
1000
+ %710 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i125, float %686, float 0x3FDEC70940000000) #6, !dbg !42
1001
+ %.07.i127 = select i1 %.not8.i126, float %710, float %709, !dbg !42
1002
+ %711 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
1003
+ %.not9.i128 = icmp eq i32 %711, 0, !dbg !42
1004
+ %712 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i127, float %686, float 0xBFE7154760000000) #6, !dbg !42
1005
+ %713 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i127, float %686, float 0xBFE7154760000000) #6, !dbg !42
1006
+ %.06.i129 = select i1 %.not9.i128, float %713, float %712, !dbg !42
1007
+ %714 = fmul float %686, %.06.i129, !dbg !42
1008
+ %715 = fmul float %686, %714, !dbg !42
1009
+ %716 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
1010
+ %.not10.i130 = icmp eq i32 %716, 0, !dbg !42
1011
+ %717 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %686, float 0x3FF7154760000000, float %715) #6, !dbg !42
1012
+ %718 = tail call float @llvm.nvvm.fma.rn.f(float %686, float 0x3FF7154760000000, float %715) #6, !dbg !42
1013
+ %.04.i131 = select i1 %.not10.i130, float %718, float %717, !dbg !42
1014
+ %719 = fadd float %.08.i111, %.04.i131, !dbg !42
1015
+ %720 = icmp ugt i32 %677, 2139095039, !dbg !42
1016
+ br i1 %720, label %__nv_fmaf_rn.exit.i.i134, label %__nv_log2f.exit137, !dbg !42
1017
+
1018
+ __nv_fmaf_rn.exit.i.i134: ; preds = %__nv_log2f.exit107
1019
+ %721 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
1020
+ %.not11.i135 = icmp eq i32 %721, 0, !dbg !42
1021
+ %722 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i108, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !42
1022
+ %723 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i108, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !42
1023
+ %.03.i136 = select i1 %.not11.i135, float %723, float %722, !dbg !42
1024
+ br label %__nv_log2f.exit137, !dbg !42
1025
+
1026
+ __nv_log2f.exit137: ; preds = %__nv_log2f.exit107, %__nv_fmaf_rn.exit.i.i134
1027
+ %r.i.0.i132 = phi float [ %.03.i136, %__nv_fmaf_rn.exit.i.i134 ], [ %719, %__nv_log2f.exit107 ], !dbg !42
1028
+ %724 = insertelement <4 x float> poison, float %.02.i, i64 0, !dbg !42
1029
+ %725 = insertelement <4 x float> %724, float %.02.i48, i64 1, !dbg !42
1030
+ %726 = insertelement <4 x float> %725, float %.02.i78, i64 2, !dbg !42
1031
+ %727 = insertelement <4 x float> %726, float %.02.i108, i64 3, !dbg !42
1032
+ %728 = fcmp oeq <4 x float> %727, zeroinitializer, !dbg !42
1033
+ %729 = and i32 %12, 127, !dbg !10
1034
+ %730 = or disjoint i32 %11, %729, !dbg !11
1035
+ %731 = icmp slt i32 %730, 1536, !dbg !12
1036
+ %732 = insertelement <4 x float> poison, float %r.i.0.i, i64 0, !dbg !42
1037
+ %733 = insertelement <4 x float> %732, float %r.i.0.i72, i64 1, !dbg !42
1038
+ %734 = insertelement <4 x float> %733, float %r.i.0.i102, i64 2, !dbg !42
1039
+ %735 = insertelement <4 x float> %734, float %r.i.0.i132, i64 3, !dbg !42
1040
+ %736 = select <4 x i1> %728, <4 x float> splat (float 0xFFF0000000000000), <4 x float> %735, !dbg !42
1041
+ %737 = insertelement <4 x float> poison, float %286, i64 0, !dbg !43
1042
+ %738 = insertelement <4 x float> %737, float %287, i64 1, !dbg !43
1043
+ %739 = insertelement <4 x float> %738, float %288, i64 2, !dbg !43
1044
+ %740 = insertelement <4 x float> %739, float %289, i64 3, !dbg !43
1045
+ %741 = fadd <4 x float> %740, %736, !dbg !43
1046
+ %742 = fmul <4 x float> %741, splat (float 0x3FE62E4300000000), !dbg !44
1047
+ %743 = sext i32 %730 to i64, !dbg !45
1048
+ %744 = getelementptr float, ptr addrspace(1) %4, i64 %743, !dbg !45
1049
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46
1050
+ store <4 x float> %742, ptr addrspace(3) %291, align 16, !dbg !46
1051
+ tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46
1052
+ %745 = tail call i32 asm sideeffect "ldmatrix.sync.aligned.m8n8.x1.shared.b16 {$0}, [$1];", "=r,r"(i32 %301) #6, !dbg !46
1053
+ %746 = and i32 %12, 128, !dbg !46
1054
+ %747 = icmp eq i32 %746, 0, !dbg !46
1055
+ %748 = and i1 %747, %731, !dbg !46
1056
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %745, ptr addrspace(1) %744, i1 %748) #6, !dbg !46
1057
+ %749 = getelementptr float, ptr addrspace(1) %2, i64 %743, !dbg !47
1058
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %302, ptr addrspace(1) %749, i1 %748) #6, !dbg !48
1059
+ %750 = getelementptr float, ptr addrspace(1) %3, i64 %743, !dbg !49
1060
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %523, ptr addrspace(1) %750, i1 %748) #6, !dbg !50
1061
+ ret void, !dbg !51
1062
+ }
1063
+
1064
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
1065
+ declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
1066
+
1067
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
1068
+ declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
1069
+
1070
+ ; Function Attrs: convergent nocallback nounwind
1071
+ declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2
1072
+
1073
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
1074
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3
1075
+
1076
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
1077
+
1078
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
1079
+ declare float @llvm.nvvm.ex2.approx.ftz.f(float) #5
1080
+
1081
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
1082
+ declare float @llvm.nvvm.ex2.approx.f(float) #5
1083
+
1084
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
1085
+ declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #1
1086
+
1087
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
1088
+ declare float @llvm.nvvm.fma.rn.f(float, float, float) #1
1089
+
1090
+ attributes #0 = { nounwind "nvvm.reqntid"="256" }
1091
+ attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
1092
+ attributes #2 = { convergent nocallback nounwind }
1093
+ attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
1094
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
1095
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
1096
+ attributes #6 = { nounwind }
1097
+
1098
+ !llvm.dbg.cu = !{!0}
1099
+ !llvm.module.flags = !{!2, !3}
1100
+ !llvm.ident = !{!4}
1101
+
1102
+ !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
1103
+ !1 = !DIFile(filename: "cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py", directory: "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5")
1104
+ !2 = !{i32 2, !"Debug Info Version", i32 3}
1105
+ !3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
1106
+ !4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
1107
+ !5 = distinct !DISubprogram(name: "triton_per_fused_mul_1", linkageName: "triton_per_fused_mul_1", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
1108
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
1109
+ !7 = !{}
1110
+ !8 = !DILocation(line: 24, column: 28, scope: !5)
1111
+ !9 = !DILocation(line: 24, column: 33, scope: !5)
1112
+ !10 = !DILocation(line: 25, column: 44, scope: !5)
1113
+ !11 = !DILocation(line: 25, column: 23, scope: !5)
1114
+ !12 = !DILocation(line: 26, column: 21, scope: !5)
1115
+ !13 = !DILocation(line: 27, column: 38, scope: !5)
1116
+ !14 = !DILocation(line: 34, column: 40, scope: !5)
1117
+ !15 = !DILocation(line: 34, column: 35, scope: !5)
1118
+ !16 = !DILocation(line: 34, column: 30, scope: !5)
1119
+ !17 = !DILocation(line: 34, column: 47, scope: !5)
1120
+ !18 = !DILocation(line: 35, column: 30, scope: !5)
1121
+ !19 = !DILocation(line: 35, column: 47, scope: !5)
1122
+ !20 = !DILocation(line: 37, column: 33, scope: !5)
1123
+ !21 = !DILocation(line: 110, column: 15, scope: !22, inlinedAt: !24)
1124
+ !22 = distinct !DILexicalBlockFile(scope: !5, file: !23, discriminator: 0)
1125
+ !23 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime")
1126
+ !24 = !DILocation(line: 38, column: 37, scope: !5)
1127
+ !25 = !DILocation(line: 112, column: 21, scope: !22, inlinedAt: !24)
1128
+ !26 = !DILocation(line: 112, column: 16, scope: !22, inlinedAt: !24)
1129
+ !27 = !DILocation(line: 113, column: 29, scope: !22, inlinedAt: !24)
1130
+ !28 = !DILocation(line: 123, column: 29, scope: !22, inlinedAt: !24)
1131
+ !29 = !DILocation(line: 38, column: 40, scope: !5)
1132
+ !30 = !DILocation(line: 40, column: 19, scope: !5)
1133
+ !31 = !DILocation(line: 41, column: 18, scope: !5)
1134
+ !32 = !DILocation(line: 43, column: 33, scope: !5)
1135
+ !33 = !DILocation(line: 44, column: 27, scope: !5)
1136
+ !34 = !DILocation(line: 45, column: 19, scope: !5)
1137
+ !35 = !DILocation(line: 291, column: 36, scope: !36, inlinedAt: !38)
1138
+ !36 = distinct !DILexicalBlockFile(scope: !5, file: !37, discriminator: 0)
1139
+ !37 = !DIFile(filename: "standard.py", directory: "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language")
1140
+ !38 = !DILocation(line: 48, column: 26, scope: !5)
1141
+ !39 = !DILocation(line: 261, column: 15, scope: !36, inlinedAt: !38)
1142
+ !40 = !DILocation(line: 48, column: 29, scope: !5)
1143
+ !41 = !DILocation(line: 50, column: 34, scope: !5)
1144
+ !42 = !DILocation(line: 51, column: 27, scope: !5)
1145
+ !43 = !DILocation(line: 52, column: 20, scope: !5)
1146
+ !44 = !DILocation(line: 54, column: 20, scope: !5)
1147
+ !45 = !DILocation(line: 55, column: 25, scope: !5)
1148
+ !46 = !DILocation(line: 55, column: 37, scope: !5)
1149
+ !47 = !DILocation(line: 56, column: 25, scope: !5)
1150
+ !48 = !DILocation(line: 56, column: 36, scope: !5)
1151
+ !49 = !DILocation(line: 57, column: 25, scope: !5)
1152
+ !50 = !DILocation(line: 57, column: 37, scope: !5)
1153
+ !51 = !DILocation(line: 57, column: 4, scope: !5)
progress/SpecForge/cache/compiled_kernels/triton/3/2GPX4H2RCLQEV2LIJWPK5V4BGHUBOBJI3FUMPIZKQJ7CMB34OTAA/triton_per_fused_mul_1.ptx ADDED
@@ -0,0 +1,1060 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.7
6
+ .target sm_90a
7
+ .address_size 64
8
+
9
+ // .globl triton_per_fused_mul_1 // -- Begin function triton_per_fused_mul_1
10
+ .extern .shared .align 16 .b8 global_smem[];
11
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
12
+ // @triton_per_fused_mul_1
13
+ .visible .entry triton_per_fused_mul_1(
14
+ .param .u64 .ptr .global .align 1 triton_per_fused_mul_1_param_0,
15
+ .param .u64 .ptr .global .align 1 triton_per_fused_mul_1_param_1,
16
+ .param .u64 .ptr .global .align 1 triton_per_fused_mul_1_param_2,
17
+ .param .u64 .ptr .global .align 1 triton_per_fused_mul_1_param_3,
18
+ .param .u64 .ptr .global .align 1 triton_per_fused_mul_1_param_4,
19
+ .param .u32 triton_per_fused_mul_1_param_5,
20
+ .param .u32 triton_per_fused_mul_1_param_6,
21
+ .param .u64 .ptr .global .align 1 triton_per_fused_mul_1_param_7,
22
+ .param .u64 .ptr .global .align 1 triton_per_fused_mul_1_param_8
23
+ )
24
+ .reqntid 256
25
+ {
26
+ .reg .pred %p<102>;
27
+ .reg .b32 %r<484>;
28
+ .reg .b64 %rd<24>;
29
+ .loc 1 18 0 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:18:0
30
+ $L__func_begin0:
31
+ .loc 1 18 0 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:18:0
32
+
33
+ // %bb.0: // %__nv_exp2f.exit
34
+ ld.param.b64 %rd14, [triton_per_fused_mul_1_param_0];
35
+ ld.param.b64 %rd15, [triton_per_fused_mul_1_param_1];
36
+ $L__tmp0:
37
+ .loc 1 24 28 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:24:28
38
+ mov.u32 %r142, %ctaid.x;
39
+ .loc 1 24 33 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:24:33
40
+ shl.b32 %r1, %r142, 7;
41
+ .loc 1 25 44 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:25:44
42
+ mov.u32 %r2, %tid.x;
43
+ shr.u32 %r143, %r2, 5;
44
+ and.b32 %r144, %r2, 31;
45
+ shl.b32 %r145, %r144, 2;
46
+ .loc 1 25 23 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:25:23
47
+ or.b32 %r146, %r145, %r1;
48
+ .loc 1 26 21 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:26:21
49
+ setp.lt.s32 %p1, %r146, 1536;
50
+ .loc 1 27 38 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:27:38
51
+ bfe.u32 %r147, %r2, 5, 3;
52
+ or.b32 %r148, %r143, 24;
53
+ .loc 1 34 35 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:34:35
54
+ mad.lo.s32 %r149, %r147, 1536, %r146;
55
+ add.s32 %r150, %r149, 12288;
56
+ add.s32 %r151, %r149, 24576;
57
+ mad.lo.s32 %r152, %r148, 1536, %r146;
58
+ .loc 1 34 30 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:34:30
59
+ mul.wide.s32 %rd16, %r149, 4;
60
+ add.s64 %rd6, %rd14, %rd16;
61
+ mul.wide.s32 %rd17, %r150, 4;
62
+ add.s64 %rd7, %rd14, %rd17;
63
+ mul.wide.s32 %rd18, %r151, 4;
64
+ add.s64 %rd8, %rd14, %rd18;
65
+ mul.wide.s32 %rd19, %r152, 4;
66
+ add.s64 %rd9, %rd14, %rd19;
67
+ mov.b32 %r30, 0;
68
+ .loc 1 34 47 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:34:47
69
+ // begin inline asm
70
+ mov.u32 %r26, %r30;
71
+ mov.u32 %r27, %r30;
72
+ mov.u32 %r28, %r30;
73
+ mov.u32 %r29, %r30;
74
+ @%p1 ld.global.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd6 + 0 ];
75
+ // end inline asm
76
+ // begin inline asm
77
+ mov.u32 %r34, %r30;
78
+ mov.u32 %r35, %r30;
79
+ mov.u32 %r36, %r30;
80
+ mov.u32 %r37, %r30;
81
+ @%p1 ld.global.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd7 + 0 ];
82
+ // end inline asm
83
+ // begin inline asm
84
+ mov.u32 %r42, %r30;
85
+ mov.u32 %r43, %r30;
86
+ mov.u32 %r44, %r30;
87
+ mov.u32 %r45, %r30;
88
+ @%p1 ld.global.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd8 + 0 ];
89
+ // end inline asm
90
+ // begin inline asm
91
+ mov.u32 %r50, %r30;
92
+ mov.u32 %r51, %r30;
93
+ mov.u32 %r52, %r30;
94
+ mov.u32 %r53, %r30;
95
+ @%p1 ld.global.v4.b32 { %r50, %r51, %r52, %r53 }, [ %rd9 + 0 ];
96
+ // end inline asm
97
+ .loc 1 35 30 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:35:30
98
+ add.s64 %rd10, %rd15, %rd16;
99
+ add.s64 %rd11, %rd15, %rd17;
100
+ add.s64 %rd12, %rd15, %rd18;
101
+ add.s64 %rd13, %rd15, %rd19;
102
+ .loc 1 35 47 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:35:47
103
+ // begin inline asm
104
+ mov.u32 %r58, %r30;
105
+ mov.u32 %r59, %r30;
106
+ mov.u32 %r60, %r30;
107
+ mov.u32 %r61, %r30;
108
+ @%p1 ld.global.v4.b32 { %r58, %r59, %r60, %r61 }, [ %rd10 + 0 ];
109
+ // end inline asm
110
+ // begin inline asm
111
+ mov.u32 %r66, %r30;
112
+ mov.u32 %r67, %r30;
113
+ mov.u32 %r68, %r30;
114
+ mov.u32 %r69, %r30;
115
+ @%p1 ld.global.v4.b32 { %r66, %r67, %r68, %r69 }, [ %rd11 + 0 ];
116
+ // end inline asm
117
+ // begin inline asm
118
+ mov.u32 %r74, %r30;
119
+ mov.u32 %r75, %r30;
120
+ mov.u32 %r76, %r30;
121
+ mov.u32 %r77, %r30;
122
+ @%p1 ld.global.v4.b32 { %r74, %r75, %r76, %r77 }, [ %rd12 + 0 ];
123
+ // end inline asm
124
+ // begin inline asm
125
+ mov.u32 %r82, %r30;
126
+ mov.u32 %r83, %r30;
127
+ mov.u32 %r84, %r30;
128
+ mov.u32 %r85, %r30;
129
+ @%p1 ld.global.v4.b32 { %r82, %r83, %r84, %r85 }, [ %rd13 + 0 ];
130
+ // end inline asm
131
+ .loc 1 37 33 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:37:33
132
+ selp.f32 %r153, %r26, 0fFF800000, %p1;
133
+ selp.f32 %r154, %r27, 0fFF800000, %p1;
134
+ selp.f32 %r155, %r28, 0fFF800000, %p1;
135
+ selp.f32 %r156, %r29, 0fFF800000, %p1;
136
+ selp.f32 %r157, %r34, 0fFF800000, %p1;
137
+ selp.f32 %r158, %r35, 0fFF800000, %p1;
138
+ selp.f32 %r159, %r36, 0fFF800000, %p1;
139
+ selp.f32 %r160, %r37, 0fFF800000, %p1;
140
+ selp.f32 %r161, %r42, 0fFF800000, %p1;
141
+ selp.f32 %r162, %r43, 0fFF800000, %p1;
142
+ selp.f32 %r163, %r44, 0fFF800000, %p1;
143
+ selp.f32 %r164, %r45, 0fFF800000, %p1;
144
+ selp.f32 %r165, %r50, 0fFF800000, %p1;
145
+ selp.f32 %r166, %r51, 0fFF800000, %p1;
146
+ selp.f32 %r167, %r52, 0fFF800000, %p1;
147
+ selp.f32 %r168, %r53, 0fFF800000, %p1;
148
+ $L__tmp1:
149
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
150
+ setp.gt.f32 %p33, %r153, %r157;
151
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
152
+ setp.nan.f32 %p34, %r153, %r153;
153
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
154
+ selp.f32 %r169, %r153, %r157, %p33;
155
+ selp.f32 %r170, %r153, %r169, %p34;
156
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
157
+ setp.gt.f32 %p35, %r154, %r158;
158
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
159
+ setp.nan.f32 %p36, %r154, %r154;
160
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
161
+ selp.f32 %r171, %r154, %r158, %p35;
162
+ selp.f32 %r172, %r154, %r171, %p36;
163
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
164
+ setp.gt.f32 %p37, %r155, %r159;
165
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
166
+ setp.nan.f32 %p38, %r155, %r155;
167
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
168
+ selp.f32 %r173, %r155, %r159, %p37;
169
+ selp.f32 %r174, %r155, %r173, %p38;
170
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
171
+ setp.gt.f32 %p39, %r156, %r160;
172
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
173
+ setp.nan.f32 %p40, %r156, %r156;
174
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
175
+ selp.f32 %r175, %r156, %r160, %p39;
176
+ selp.f32 %r176, %r156, %r175, %p40;
177
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
178
+ setp.gt.f32 %p41, %r170, %r161;
179
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
180
+ setp.nan.f32 %p42, %r170, %r170;
181
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
182
+ selp.f32 %r177, %r170, %r161, %p42;
183
+ selp.f32 %r178, %r170, %r177, %p41;
184
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
185
+ setp.gt.f32 %p43, %r172, %r162;
186
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
187
+ setp.nan.f32 %p44, %r172, %r172;
188
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
189
+ selp.f32 %r179, %r172, %r162, %p44;
190
+ selp.f32 %r180, %r172, %r179, %p43;
191
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
192
+ setp.gt.f32 %p45, %r174, %r163;
193
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
194
+ setp.nan.f32 %p46, %r174, %r174;
195
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
196
+ selp.f32 %r181, %r174, %r163, %p46;
197
+ selp.f32 %r182, %r174, %r181, %p45;
198
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
199
+ setp.gt.f32 %p47, %r176, %r164;
200
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
201
+ setp.nan.f32 %p48, %r176, %r176;
202
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
203
+ selp.f32 %r183, %r176, %r164, %p48;
204
+ selp.f32 %r184, %r176, %r183, %p47;
205
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
206
+ setp.gt.f32 %p49, %r178, %r165;
207
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
208
+ setp.nan.f32 %p50, %r178, %r178;
209
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
210
+ selp.f32 %r185, %r178, %r165, %p50;
211
+ selp.f32 %r91, %r178, %r185, %p49;
212
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
213
+ setp.gt.f32 %p51, %r180, %r166;
214
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
215
+ setp.nan.f32 %p52, %r180, %r180;
216
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
217
+ selp.f32 %r186, %r180, %r166, %p52;
218
+ selp.f32 %r93, %r180, %r186, %p51;
219
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
220
+ setp.gt.f32 %p53, %r182, %r167;
221
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
222
+ setp.nan.f32 %p54, %r182, %r182;
223
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
224
+ selp.f32 %r187, %r182, %r167, %p54;
225
+ selp.f32 %r95, %r182, %r187, %p53;
226
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
227
+ setp.gt.f32 %p55, %r184, %r168;
228
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
229
+ setp.nan.f32 %p56, %r184, %r184;
230
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
231
+ selp.f32 %r188, %r184, %r168, %p56;
232
+ selp.f32 %r97, %r184, %r188, %p55;
233
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
234
+ shl.b32 %r189, %r144, 7;
235
+ mov.b32 %r190, global_smem;
236
+ add.s32 %r191, %r190, %r189;
237
+ shl.b32 %r192, %r147, 2;
238
+ add.s32 %r90, %r191, %r192;
239
+ mov.pred %p9, -1;
240
+ // begin inline asm
241
+ @%p9 st.shared.b32 [ %r90 + 0 ], %r91;
242
+ // end inline asm
243
+ add.s32 %r92, %r90, 32;
244
+ // begin inline asm
245
+ @%p9 st.shared.b32 [ %r92 + 0 ], %r93;
246
+ // end inline asm
247
+ add.s32 %r94, %r90, 64;
248
+ // begin inline asm
249
+ @%p9 st.shared.b32 [ %r94 + 0 ], %r95;
250
+ // end inline asm
251
+ add.s32 %r96, %r90, 96;
252
+ // begin inline asm
253
+ @%p9 st.shared.b32 [ %r96 + 0 ], %r97;
254
+ // end inline asm
255
+ bar.sync 0;
256
+ shl.b32 %r193, %r2, 2;
257
+ add.s32 %r99, %r190, %r193;
258
+ // begin inline asm
259
+ @%p9 ld.shared.b32 %r98, [ %r99 + 0 ];
260
+ // end inline asm
261
+ shfl.sync.bfly.b32 %r194, %r98, 4, 31, -1;
262
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
263
+ setp.gt.f32 %p57, %r98, %r194;
264
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
265
+ setp.nan.f32 %p58, %r98, %r98;
266
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
267
+ selp.f32 %r195, %r98, %r194, %p57;
268
+ selp.f32 %r196, %r98, %r195, %p58;
269
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
270
+ shfl.sync.bfly.b32 %r197, %r196, 2, 31, -1;
271
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
272
+ setp.gt.f32 %p59, %r196, %r197;
273
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
274
+ setp.nan.f32 %p60, %r196, %r196;
275
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
276
+ selp.f32 %r198, %r196, %r197, %p60;
277
+ selp.f32 %r199, %r196, %r198, %p59;
278
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
279
+ shfl.sync.bfly.b32 %r200, %r199, 1, 31, -1;
280
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
281
+ setp.gt.f32 %p61, %r199, %r200;
282
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
283
+ setp.nan.f32 %p62, %r199, %r199;
284
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
285
+ and.b32 %r201, %r2, 7;
286
+ setp.eq.b32 %p14, %r201, 0;
287
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
288
+ selp.b32 %r202, %r199, %r200, %p62;
289
+ selp.b32 %r101, %r199, %r202, %p61;
290
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
291
+ // begin inline asm
292
+ @%p14 st.shared.b32 [ %r99 + 0 ], %r101;
293
+ // end inline asm
294
+ add.s32 %r103, %r99, 1024;
295
+ // begin inline asm
296
+ @%p9 ld.shared.b32 %r102, [ %r103 + 0 ];
297
+ // end inline asm
298
+ shfl.sync.bfly.b32 %r203, %r102, 4, 31, -1;
299
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
300
+ setp.gt.f32 %p63, %r102, %r203;
301
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
302
+ setp.nan.f32 %p64, %r102, %r102;
303
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
304
+ selp.f32 %r204, %r102, %r203, %p63;
305
+ selp.f32 %r205, %r102, %r204, %p64;
306
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
307
+ shfl.sync.bfly.b32 %r206, %r205, 2, 31, -1;
308
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
309
+ setp.gt.f32 %p65, %r205, %r206;
310
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
311
+ setp.nan.f32 %p66, %r205, %r205;
312
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
313
+ selp.f32 %r207, %r205, %r206, %p66;
314
+ selp.f32 %r208, %r205, %r207, %p65;
315
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
316
+ shfl.sync.bfly.b32 %r209, %r208, 1, 31, -1;
317
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
318
+ setp.gt.f32 %p67, %r208, %r209;
319
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
320
+ setp.nan.f32 %p68, %r208, %r208;
321
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
322
+ selp.b32 %r210, %r208, %r209, %p68;
323
+ selp.b32 %r105, %r208, %r210, %p67;
324
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
325
+ // begin inline asm
326
+ @%p14 st.shared.b32 [ %r103 + 0 ], %r105;
327
+ // end inline asm
328
+ add.s32 %r107, %r99, 2048;
329
+ // begin inline asm
330
+ @%p9 ld.shared.b32 %r106, [ %r107 + 0 ];
331
+ // end inline asm
332
+ shfl.sync.bfly.b32 %r211, %r106, 4, 31, -1;
333
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
334
+ setp.gt.f32 %p69, %r106, %r211;
335
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
336
+ setp.nan.f32 %p70, %r106, %r106;
337
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
338
+ selp.f32 %r212, %r106, %r211, %p69;
339
+ selp.f32 %r213, %r106, %r212, %p70;
340
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
341
+ shfl.sync.bfly.b32 %r214, %r213, 2, 31, -1;
342
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
343
+ setp.gt.f32 %p71, %r213, %r214;
344
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
345
+ setp.nan.f32 %p72, %r213, %r213;
346
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
347
+ selp.f32 %r215, %r213, %r214, %p72;
348
+ selp.f32 %r216, %r213, %r215, %p71;
349
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
350
+ shfl.sync.bfly.b32 %r217, %r216, 1, 31, -1;
351
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
352
+ setp.gt.f32 %p73, %r216, %r217;
353
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
354
+ setp.nan.f32 %p74, %r216, %r216;
355
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
356
+ selp.b32 %r218, %r216, %r217, %p74;
357
+ selp.b32 %r109, %r216, %r218, %p73;
358
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
359
+ // begin inline asm
360
+ @%p14 st.shared.b32 [ %r107 + 0 ], %r109;
361
+ // end inline asm
362
+ add.s32 %r111, %r99, 3072;
363
+ // begin inline asm
364
+ @%p9 ld.shared.b32 %r110, [ %r111 + 0 ];
365
+ // end inline asm
366
+ shfl.sync.bfly.b32 %r219, %r110, 4, 31, -1;
367
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
368
+ setp.gt.f32 %p75, %r110, %r219;
369
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
370
+ setp.nan.f32 %p76, %r110, %r110;
371
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
372
+ selp.f32 %r220, %r110, %r219, %p75;
373
+ selp.f32 %r221, %r110, %r220, %p76;
374
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
375
+ shfl.sync.bfly.b32 %r222, %r221, 2, 31, -1;
376
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
377
+ setp.gt.f32 %p77, %r221, %r222;
378
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
379
+ setp.nan.f32 %p78, %r221, %r221;
380
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
381
+ selp.f32 %r223, %r221, %r222, %p78;
382
+ selp.f32 %r224, %r221, %r223, %p77;
383
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
384
+ shfl.sync.bfly.b32 %r225, %r224, 1, 31, -1;
385
+ .loc 2 110 15 // triton_helpers.py:110:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
386
+ setp.gt.f32 %p79, %r224, %r225;
387
+ .loc 2 112 21 // triton_helpers.py:112:21 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
388
+ setp.nan.f32 %p80, %r224, %r224;
389
+ .loc 2 113 29 // triton_helpers.py:113:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
390
+ selp.b32 %r226, %r224, %r225, %p80;
391
+ selp.b32 %r113, %r224, %r226, %p79;
392
+ .loc 2 123 29 // triton_helpers.py:123:29 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:37 ]
393
+ // begin inline asm
394
+ @%p14 st.shared.b32 [ %r111 + 0 ], %r113;
395
+ // end inline asm
396
+ bar.sync 0;
397
+ ld.shared.b32 %r227, [%r191];
398
+ ld.shared.b32 %r228, [%r191+32];
399
+ ld.shared.b32 %r229, [%r191+64];
400
+ ld.shared.b32 %r230, [%r191+96];
401
+ $L__tmp2:
402
+ .loc 1 38 40 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:38:40
403
+ bar.sync 0;
404
+ mad.lo.s32 %r3, %r144, -112, %r191;
405
+ mov.b64 %rd2, {%r229, %r230};
406
+ mov.b64 %rd1, {%r227, %r228};
407
+ st.shared.v2.b64 [%r3], {%rd1, %rd2};
408
+ bar.sync 0;
409
+ shl.b32 %r231, %r143, 7;
410
+ shl.b32 %r232, %r2, 4;
411
+ or.b32 %r233, %r231, %r232;
412
+ and.b32 %r234, %r233, 496;
413
+ add.s32 %r457, %r190, %r234;
414
+ // begin inline asm
415
+ ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%r459}, [%r457];
416
+ // end inline asm
417
+ .loc 1 40 19 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:40:19
418
+ setp.eq.f32 %p81, %r227, 0fFF800000;
419
+ setp.eq.f32 %p82, %r228, 0fFF800000;
420
+ setp.eq.f32 %p83, %r229, 0fFF800000;
421
+ setp.eq.f32 %p84, %r230, 0fFF800000;
422
+ .loc 1 41 18 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:41:18
423
+ sub.f32 %r235, %r26, %r227;
424
+ sub.f32 %r236, %r27, %r228;
425
+ sub.f32 %r237, %r28, %r229;
426
+ sub.f32 %r238, %r29, %r230;
427
+ sub.f32 %r239, %r34, %r227;
428
+ sub.f32 %r240, %r35, %r228;
429
+ sub.f32 %r241, %r36, %r229;
430
+ sub.f32 %r242, %r37, %r230;
431
+ sub.f32 %r243, %r42, %r227;
432
+ sub.f32 %r244, %r43, %r228;
433
+ sub.f32 %r245, %r44, %r229;
434
+ sub.f32 %r246, %r45, %r230;
435
+ sub.f32 %r247, %r50, %r227;
436
+ sub.f32 %r248, %r51, %r228;
437
+ sub.f32 %r249, %r52, %r229;
438
+ sub.f32 %r250, %r53, %r230;
439
+ .loc 1 43 33 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:43:33
440
+ selp.f32 %r251, 0f00000000, %r235, %p81;
441
+ selp.f32 %r252, 0f00000000, %r236, %p82;
442
+ selp.f32 %r253, 0f00000000, %r237, %p83;
443
+ selp.f32 %r254, 0f00000000, %r238, %p84;
444
+ selp.f32 %r255, 0f00000000, %r239, %p81;
445
+ selp.f32 %r256, 0f00000000, %r240, %p82;
446
+ selp.f32 %r257, 0f00000000, %r241, %p83;
447
+ selp.f32 %r258, 0f00000000, %r242, %p84;
448
+ selp.f32 %r259, 0f00000000, %r243, %p81;
449
+ selp.f32 %r260, 0f00000000, %r244, %p82;
450
+ selp.f32 %r261, 0f00000000, %r245, %p83;
451
+ selp.f32 %r262, 0f00000000, %r246, %p84;
452
+ selp.f32 %r263, 0f00000000, %r247, %p81;
453
+ selp.f32 %r264, 0f00000000, %r248, %p82;
454
+ selp.f32 %r265, 0f00000000, %r249, %p83;
455
+ selp.f32 %r266, 0f00000000, %r250, %p84;
456
+ .loc 1 44 27 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:44:27
457
+ ex2.approx.ftz.f32 %r267, %r251;
458
+ ex2.approx.ftz.f32 %r268, %r252;
459
+ ex2.approx.ftz.f32 %r269, %r253;
460
+ ex2.approx.ftz.f32 %r270, %r254;
461
+ ex2.approx.ftz.f32 %r271, %r255;
462
+ ex2.approx.ftz.f32 %r272, %r256;
463
+ ex2.approx.ftz.f32 %r273, %r257;
464
+ ex2.approx.ftz.f32 %r274, %r258;
465
+ ex2.approx.ftz.f32 %r275, %r259;
466
+ ex2.approx.ftz.f32 %r276, %r260;
467
+ ex2.approx.ftz.f32 %r277, %r261;
468
+ ex2.approx.ftz.f32 %r278, %r262;
469
+ ex2.approx.ftz.f32 %r279, %r263;
470
+ ex2.approx.ftz.f32 %r280, %r264;
471
+ ex2.approx.ftz.f32 %r281, %r265;
472
+ ex2.approx.ftz.f32 %r282, %r266;
473
+ .loc 1 45 19 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:45:19
474
+ mul.f32 %r283, %r271, %r66;
475
+ mul.f32 %r284, %r272, %r67;
476
+ mul.f32 %r285, %r273, %r68;
477
+ mul.f32 %r286, %r274, %r69;
478
+ $L__tmp3:
479
+ .loc 3 291 36 // standard.py:291:36 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
480
+ bar.sync 0;
481
+ .loc 3 261 15 // standard.py:261:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
482
+ fma.rn.f32 %r287, %r267, %r58, %r283;
483
+ fma.rn.f32 %r288, %r268, %r59, %r284;
484
+ fma.rn.f32 %r289, %r269, %r60, %r285;
485
+ fma.rn.f32 %r290, %r270, %r61, %r286;
486
+ fma.rn.f32 %r291, %r275, %r74, %r287;
487
+ fma.rn.f32 %r292, %r276, %r75, %r288;
488
+ fma.rn.f32 %r293, %r277, %r76, %r289;
489
+ fma.rn.f32 %r294, %r278, %r77, %r290;
490
+ fma.rn.f32 %r295, %r279, %r82, %r291;
491
+ fma.rn.f32 %r296, %r280, %r83, %r292;
492
+ fma.rn.f32 %r297, %r281, %r84, %r293;
493
+ fma.rn.f32 %r298, %r282, %r85, %r294;
494
+ selp.b32 %r117, %r295, 0, %p1;
495
+ .loc 3 291 36 // standard.py:291:36 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
496
+ // begin inline asm
497
+ @%p9 st.shared.b32 [ %r90 + 0 ], %r117;
498
+ // end inline asm
499
+ .loc 3 261 15 // standard.py:261:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
500
+ selp.b32 %r119, %r296, 0, %p1;
501
+ .loc 3 291 36 // standard.py:291:36 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
502
+ // begin inline asm
503
+ @%p9 st.shared.b32 [ %r92 + 0 ], %r119;
504
+ // end inline asm
505
+ .loc 3 261 15 // standard.py:261:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
506
+ selp.b32 %r121, %r297, 0, %p1;
507
+ .loc 3 291 36 // standard.py:291:36 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
508
+ // begin inline asm
509
+ @%p9 st.shared.b32 [ %r94 + 0 ], %r121;
510
+ // end inline asm
511
+ .loc 3 261 15 // standard.py:261:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
512
+ selp.b32 %r123, %r298, 0, %p1;
513
+ .loc 3 291 36 // standard.py:291:36 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
514
+ // begin inline asm
515
+ @%p9 st.shared.b32 [ %r96 + 0 ], %r123;
516
+ // end inline asm
517
+ bar.sync 0;
518
+ // begin inline asm
519
+ @%p9 ld.shared.b32 %r124, [ %r99 + 0 ];
520
+ // end inline asm
521
+ shfl.sync.bfly.b32 %r299, %r124, 4, 31, -1;
522
+ .loc 3 261 15 // standard.py:261:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
523
+ add.f32 %r300, %r124, %r299;
524
+ .loc 3 291 36 // standard.py:291:36 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
525
+ shfl.sync.bfly.b32 %r301, %r300, 2, 31, -1;
526
+ .loc 3 261 15 // standard.py:261:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
527
+ add.f32 %r302, %r300, %r301;
528
+ .loc 3 291 36 // standard.py:291:36 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
529
+ shfl.sync.bfly.b32 %r303, %r302, 1, 31, -1;
530
+ .loc 3 261 15 // standard.py:261:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
531
+ add.f32 %r127, %r302, %r303;
532
+ .loc 3 291 36 // standard.py:291:36 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
533
+ // begin inline asm
534
+ @%p14 st.shared.b32 [ %r99 + 0 ], %r127;
535
+ // end inline asm
536
+ // begin inline asm
537
+ @%p9 ld.shared.b32 %r128, [ %r103 + 0 ];
538
+ // end inline asm
539
+ shfl.sync.bfly.b32 %r304, %r128, 4, 31, -1;
540
+ .loc 3 261 15 // standard.py:261:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
541
+ add.f32 %r305, %r128, %r304;
542
+ .loc 3 291 36 // standard.py:291:36 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
543
+ shfl.sync.bfly.b32 %r306, %r305, 2, 31, -1;
544
+ .loc 3 261 15 // standard.py:261:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
545
+ add.f32 %r307, %r305, %r306;
546
+ .loc 3 291 36 // standard.py:291:36 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
547
+ shfl.sync.bfly.b32 %r308, %r307, 1, 31, -1;
548
+ .loc 3 261 15 // standard.py:261:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
549
+ add.f32 %r131, %r307, %r308;
550
+ .loc 3 291 36 // standard.py:291:36 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
551
+ // begin inline asm
552
+ @%p14 st.shared.b32 [ %r103 + 0 ], %r131;
553
+ // end inline asm
554
+ // begin inline asm
555
+ @%p9 ld.shared.b32 %r132, [ %r107 + 0 ];
556
+ // end inline asm
557
+ shfl.sync.bfly.b32 %r309, %r132, 4, 31, -1;
558
+ .loc 3 261 15 // standard.py:261:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
559
+ add.f32 %r310, %r132, %r309;
560
+ .loc 3 291 36 // standard.py:291:36 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
561
+ shfl.sync.bfly.b32 %r311, %r310, 2, 31, -1;
562
+ .loc 3 261 15 // standard.py:261:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
563
+ add.f32 %r312, %r310, %r311;
564
+ .loc 3 291 36 // standard.py:291:36 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
565
+ shfl.sync.bfly.b32 %r313, %r312, 1, 31, -1;
566
+ .loc 3 261 15 // standard.py:261:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
567
+ add.f32 %r135, %r312, %r313;
568
+ .loc 3 291 36 // standard.py:291:36 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
569
+ // begin inline asm
570
+ @%p14 st.shared.b32 [ %r107 + 0 ], %r135;
571
+ // end inline asm
572
+ // begin inline asm
573
+ @%p9 ld.shared.b32 %r136, [ %r111 + 0 ];
574
+ // end inline asm
575
+ shfl.sync.bfly.b32 %r314, %r136, 4, 31, -1;
576
+ .loc 3 261 15 // standard.py:261:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
577
+ add.f32 %r315, %r136, %r314;
578
+ .loc 3 291 36 // standard.py:291:36 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
579
+ shfl.sync.bfly.b32 %r316, %r315, 2, 31, -1;
580
+ .loc 3 261 15 // standard.py:261:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
581
+ add.f32 %r317, %r315, %r316;
582
+ .loc 3 291 36 // standard.py:291:36 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
583
+ shfl.sync.bfly.b32 %r318, %r317, 1, 31, -1;
584
+ .loc 3 261 15 // standard.py:261:15 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
585
+ add.f32 %r139, %r317, %r318;
586
+ .loc 3 291 36 // standard.py:291:36 @[ cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:26 ]
587
+ // begin inline asm
588
+ @%p14 st.shared.b32 [ %r111 + 0 ], %r139;
589
+ // end inline asm
590
+ bar.sync 0;
591
+ ld.shared.b32 %r319, [%r191];
592
+ ld.shared.b32 %r320, [%r191+32];
593
+ ld.shared.b32 %r321, [%r191+64];
594
+ ld.shared.b32 %r322, [%r191+96];
595
+ $L__tmp4:
596
+ .loc 1 48 29 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:48:29
597
+ bar.sync 0;
598
+ st.shared.v4.b32 [%r3], {%r319, %r320, %r321, %r322};
599
+ bar.sync 0;
600
+ // begin inline asm
601
+ ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%r460}, [%r457];
602
+ // end inline asm
603
+ .loc 1 50 34 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:50:34
604
+ selp.f32 %r323, 0f3F800000, %r319, %p81;
605
+ selp.f32 %r7, 0f3F800000, %r320, %p82;
606
+ .loc 1 51 27 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:51:27
607
+ setp.lt.f32 %p85, %r323, 0f00800000;
608
+ mul.f32 %r324, %r323, 0f4B000000;
609
+ selp.f32 %r10, %r324, %r323, %p85;
610
+ selp.f32 %r325, 0fC1B80000, 0f00000000, %p85;
611
+ add.s32 %r326, %r10, -1060439283;
612
+ and.b32 %r327, %r326, -8388608;
613
+ sub.s32 %r328, %r10, %r327;
614
+ cvt.rn.f32.s32 %r329, %r327;
615
+ mov.b32 %r330, 0f34000000;
616
+ fma.rn.ftz.f32 %r331, %r329, %r330, %r325;
617
+ add.f32 %r332, %r328, 0fBF800000;
618
+ mov.b32 %r333, 0fBE2C7F30;
619
+ mov.b32 %r334, 0f3DC6B27F;
620
+ fma.rn.ftz.f32 %r335, %r334, %r332, %r333;
621
+ mov.b32 %r336, 0f3E2FCF2A;
622
+ fma.rn.ftz.f32 %r337, %r335, %r332, %r336;
623
+ mov.b32 %r338, 0fBE374E43;
624
+ fma.rn.ftz.f32 %r339, %r337, %r332, %r338;
625
+ mov.b32 %r340, 0f3E520BF4;
626
+ fma.rn.ftz.f32 %r341, %r339, %r332, %r340;
627
+ mov.b32 %r342, 0fBE763C8B;
628
+ fma.rn.ftz.f32 %r343, %r341, %r332, %r342;
629
+ mov.b32 %r344, 0f3E93BF99;
630
+ fma.rn.ftz.f32 %r345, %r343, %r332, %r344;
631
+ mov.b32 %r346, 0fBEB8AA49;
632
+ fma.rn.ftz.f32 %r347, %r345, %r332, %r346;
633
+ mov.b32 %r348, 0f3EF6384A;
634
+ fma.rn.ftz.f32 %r349, %r347, %r332, %r348;
635
+ mov.b32 %r350, 0fBF38AA3B;
636
+ fma.rn.ftz.f32 %r351, %r349, %r332, %r350;
637
+ mul.f32 %r352, %r332, %r351;
638
+ mul.f32 %r353, %r332, %r352;
639
+ mov.b32 %r354, 0f3FB8AA3B;
640
+ fma.rn.ftz.f32 %r355, %r332, %r354, %r353;
641
+ add.f32 %r480, %r331, %r355;
642
+ setp.lt.u32 %p86, %r10, 2139095040;
643
+ mov.b32 %r356, 0f7F800000;
644
+ @%p86 bra $L__BB0_2;
645
+ // %bb.1: // %__nv_fmaf_rn.exit.i.i
646
+ .loc 1 0 27 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:0:27
647
+ fma.rn.ftz.f32 %r480, %r10, %r356, %r356;
648
+ $L__BB0_2: // %__nv_log2f.exit
649
+ selp.f32 %r8, 0f3F800000, %r321, %p83;
650
+ .loc 1 51 27 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:51:27
651
+ setp.lt.f32 %p87, %r7, 0f00800000;
652
+ mul.f32 %r357, %r7, 0f4B000000;
653
+ selp.f32 %r14, %r357, %r7, %p87;
654
+ selp.f32 %r358, 0fC1B80000, 0f00000000, %p87;
655
+ add.s32 %r359, %r14, -1060439283;
656
+ and.b32 %r360, %r359, -8388608;
657
+ sub.s32 %r361, %r14, %r360;
658
+ cvt.rn.f32.s32 %r362, %r360;
659
+ fma.rn.ftz.f32 %r364, %r362, %r330, %r358;
660
+ add.f32 %r365, %r361, 0fBF800000;
661
+ fma.rn.ftz.f32 %r368, %r334, %r365, %r333;
662
+ fma.rn.ftz.f32 %r370, %r368, %r365, %r336;
663
+ fma.rn.ftz.f32 %r372, %r370, %r365, %r338;
664
+ fma.rn.ftz.f32 %r374, %r372, %r365, %r340;
665
+ fma.rn.ftz.f32 %r376, %r374, %r365, %r342;
666
+ fma.rn.ftz.f32 %r378, %r376, %r365, %r344;
667
+ fma.rn.ftz.f32 %r380, %r378, %r365, %r346;
668
+ fma.rn.ftz.f32 %r382, %r380, %r365, %r348;
669
+ fma.rn.ftz.f32 %r384, %r382, %r365, %r350;
670
+ mul.f32 %r385, %r365, %r384;
671
+ mul.f32 %r386, %r365, %r385;
672
+ fma.rn.ftz.f32 %r388, %r365, %r354, %r386;
673
+ add.f32 %r481, %r364, %r388;
674
+ setp.lt.u32 %p88, %r14, 2139095040;
675
+ @%p88 bra $L__BB0_4;
676
+ // %bb.3: // %__nv_fmaf_rn.exit.i.i74
677
+ .loc 1 0 27 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:0:27
678
+ fma.rn.ftz.f32 %r481, %r14, %r356, %r356;
679
+ $L__BB0_4: // %__nv_log2f.exit77
680
+ selp.f32 %r9, 0f3F800000, %r322, %p84;
681
+ .loc 1 51 27 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:51:27
682
+ setp.lt.f32 %p89, %r8, 0f00800000;
683
+ mul.f32 %r390, %r8, 0f4B000000;
684
+ selp.f32 %r18, %r390, %r8, %p89;
685
+ selp.f32 %r391, 0fC1B80000, 0f00000000, %p89;
686
+ add.s32 %r392, %r18, -1060439283;
687
+ and.b32 %r393, %r392, -8388608;
688
+ sub.s32 %r394, %r18, %r393;
689
+ cvt.rn.f32.s32 %r395, %r393;
690
+ mov.b32 %r396, 0f34000000;
691
+ fma.rn.ftz.f32 %r397, %r395, %r396, %r391;
692
+ add.f32 %r398, %r394, 0fBF800000;
693
+ mov.b32 %r399, 0fBE2C7F30;
694
+ mov.b32 %r400, 0f3DC6B27F;
695
+ fma.rn.ftz.f32 %r401, %r400, %r398, %r399;
696
+ mov.b32 %r402, 0f3E2FCF2A;
697
+ fma.rn.ftz.f32 %r403, %r401, %r398, %r402;
698
+ mov.b32 %r404, 0fBE374E43;
699
+ fma.rn.ftz.f32 %r405, %r403, %r398, %r404;
700
+ mov.b32 %r406, 0f3E520BF4;
701
+ fma.rn.ftz.f32 %r407, %r405, %r398, %r406;
702
+ mov.b32 %r408, 0fBE763C8B;
703
+ fma.rn.ftz.f32 %r409, %r407, %r398, %r408;
704
+ mov.b32 %r410, 0f3E93BF99;
705
+ fma.rn.ftz.f32 %r411, %r409, %r398, %r410;
706
+ mov.b32 %r412, 0fBEB8AA49;
707
+ fma.rn.ftz.f32 %r413, %r411, %r398, %r412;
708
+ mov.b32 %r414, 0f3EF6384A;
709
+ fma.rn.ftz.f32 %r415, %r413, %r398, %r414;
710
+ mov.b32 %r416, 0fBF38AA3B;
711
+ fma.rn.ftz.f32 %r417, %r415, %r398, %r416;
712
+ mul.f32 %r418, %r398, %r417;
713
+ mul.f32 %r419, %r398, %r418;
714
+ mov.b32 %r420, 0f3FB8AA3B;
715
+ fma.rn.ftz.f32 %r421, %r398, %r420, %r419;
716
+ add.f32 %r482, %r397, %r421;
717
+ setp.lt.u32 %p90, %r18, 2139095040;
718
+ mov.b32 %r422, 0f7F800000;
719
+ @%p90 bra $L__BB0_6;
720
+ // %bb.5: // %__nv_fmaf_rn.exit.i.i104
721
+ .loc 1 0 27 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:0:27
722
+ fma.rn.ftz.f32 %r482, %r18, %r422, %r422;
723
+ $L__BB0_6: // %__nv_log2f.exit107
724
+ ld.param.b64 %rd5, [triton_per_fused_mul_1_param_4];
725
+ ld.param.b64 %rd4, [triton_per_fused_mul_1_param_3];
726
+ ld.param.b64 %rd3, [triton_per_fused_mul_1_param_2];
727
+ .loc 1 51 27 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:51:27
728
+ setp.lt.f32 %p91, %r9, 0f00800000;
729
+ mul.f32 %r423, %r9, 0f4B000000;
730
+ selp.f32 %r22, %r423, %r9, %p91;
731
+ selp.f32 %r424, 0fC1B80000, 0f00000000, %p91;
732
+ add.s32 %r425, %r22, -1060439283;
733
+ and.b32 %r426, %r425, -8388608;
734
+ sub.s32 %r427, %r22, %r426;
735
+ cvt.rn.f32.s32 %r428, %r426;
736
+ fma.rn.ftz.f32 %r430, %r428, %r396, %r424;
737
+ add.f32 %r431, %r427, 0fBF800000;
738
+ fma.rn.ftz.f32 %r434, %r400, %r431, %r399;
739
+ fma.rn.ftz.f32 %r436, %r434, %r431, %r402;
740
+ fma.rn.ftz.f32 %r438, %r436, %r431, %r404;
741
+ fma.rn.ftz.f32 %r440, %r438, %r431, %r406;
742
+ fma.rn.ftz.f32 %r442, %r440, %r431, %r408;
743
+ fma.rn.ftz.f32 %r444, %r442, %r431, %r410;
744
+ fma.rn.ftz.f32 %r446, %r444, %r431, %r412;
745
+ fma.rn.ftz.f32 %r448, %r446, %r431, %r414;
746
+ fma.rn.ftz.f32 %r450, %r448, %r431, %r416;
747
+ mul.f32 %r451, %r431, %r450;
748
+ mul.f32 %r452, %r431, %r451;
749
+ fma.rn.ftz.f32 %r454, %r431, %r420, %r452;
750
+ add.f32 %r483, %r430, %r454;
751
+ setp.lt.u32 %p92, %r22, 2139095040;
752
+ @%p92 bra $L__BB0_8;
753
+ // %bb.7: // %__nv_fmaf_rn.exit.i.i134
754
+ .loc 1 0 27 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:0:27
755
+ fma.rn.ftz.f32 %r483, %r22, %r422, %r422;
756
+ $L__BB0_8: // %__nv_log2f.exit137
757
+ .loc 1 51 27 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:51:27
758
+ setp.eq.f32 %p96, %r10, 0f00000000;
759
+ setp.eq.f32 %p97, %r14, 0f00000000;
760
+ setp.eq.f32 %p98, %r18, 0f00000000;
761
+ setp.eq.f32 %p99, %r22, 0f00000000;
762
+ .loc 1 25 44 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:25:44
763
+ and.b32 %r461, %r2, 127;
764
+ .loc 1 25 23 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:25:23
765
+ or.b32 %r462, %r1, %r461;
766
+ .loc 1 26 21 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:26:21
767
+ setp.lt.s32 %p100, %r462, 1536;
768
+ .loc 1 51 27 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:51:27
769
+ selp.f32 %r463, 0fFF800000, %r483, %p99;
770
+ selp.f32 %r464, 0fFF800000, %r482, %p98;
771
+ selp.f32 %r465, 0fFF800000, %r481, %p97;
772
+ selp.f32 %r466, 0fFF800000, %r480, %p96;
773
+ .loc 1 52 20 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:52:20
774
+ mov.b64 {%r467, %r468}, %rd1;
775
+ add.f32 %r469, %r467, %r466;
776
+ add.f32 %r470, %r468, %r465;
777
+ mov.b64 {%r471, %r472}, %rd2;
778
+ add.f32 %r473, %r471, %r464;
779
+ add.f32 %r474, %r472, %r463;
780
+ .loc 1 54 20 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:54:20
781
+ mul.f32 %r475, %r474, 0f3F317218;
782
+ mul.f32 %r476, %r473, 0f3F317218;
783
+ mul.f32 %r477, %r470, 0f3F317218;
784
+ mul.f32 %r478, %r469, 0f3F317218;
785
+ .loc 1 55 25 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:55:25
786
+ mul.wide.s32 %rd23, %r462, 4;
787
+ add.s64 %rd20, %rd5, %rd23;
788
+ .loc 1 55 37 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:55:37
789
+ bar.sync 0;
790
+ st.shared.v4.b32 [%r3], {%r478, %r477, %r476, %r475};
791
+ bar.sync 0;
792
+ // begin inline asm
793
+ ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%r458}, [%r457];
794
+ // end inline asm
795
+ and.b32 %r479, %r2, 128;
796
+ setp.eq.b32 %p101, %r479, 0;
797
+ and.pred %p93, %p101, %p100;
798
+ // begin inline asm
799
+ @%p93 st.global.b32 [ %rd20 + 0 ], { %r458 };
800
+ // end inline asm
801
+ .loc 1 56 25 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:56:25
802
+ add.s64 %rd21, %rd3, %rd23;
803
+ .loc 1 56 36 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:56:36
804
+ // begin inline asm
805
+ @%p93 st.global.b32 [ %rd21 + 0 ], { %r459 };
806
+ // end inline asm
807
+ .loc 1 57 25 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:57:25
808
+ add.s64 %rd22, %rd4, %rd23;
809
+ .loc 1 57 37 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:57:37
810
+ // begin inline asm
811
+ @%p93 st.global.b32 [ %rd22 + 0 ], { %r460 };
812
+ // end inline asm
813
+ .loc 1 57 4 // cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py:57:4
814
+ ret;
815
+ $L__tmp5:
816
+ $L__func_end0:
817
+ // -- End function
818
+ }
819
+ .file 1 "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py"
820
+ .file 2 "/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py"
821
+ .file 3 "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py"
822
+ .section .debug_abbrev
823
+ {
824
+ .b8 1 // Abbreviation Code
825
+ .b8 17 // DW_TAG_compile_unit
826
+ .b8 1 // DW_CHILDREN_yes
827
+ .b8 37 // DW_AT_producer
828
+ .b8 8 // DW_FORM_string
829
+ .b8 19 // DW_AT_language
830
+ .b8 5 // DW_FORM_data2
831
+ .b8 3 // DW_AT_name
832
+ .b8 8 // DW_FORM_string
833
+ .b8 16 // DW_AT_stmt_list
834
+ .b8 6 // DW_FORM_data4
835
+ .b8 27 // DW_AT_comp_dir
836
+ .b8 8 // DW_FORM_string
837
+ .b8 0 // EOM(1)
838
+ .b8 0 // EOM(2)
839
+ .b8 2 // Abbreviation Code
840
+ .b8 46 // DW_TAG_subprogram
841
+ .b8 0 // DW_CHILDREN_no
842
+ .b8 3 // DW_AT_name
843
+ .b8 8 // DW_FORM_string
844
+ .b8 32 // DW_AT_inline
845
+ .b8 11 // DW_FORM_data1
846
+ .b8 0 // EOM(1)
847
+ .b8 0 // EOM(2)
848
+ .b8 3 // Abbreviation Code
849
+ .b8 46 // DW_TAG_subprogram
850
+ .b8 1 // DW_CHILDREN_yes
851
+ .b8 17 // DW_AT_low_pc
852
+ .b8 1 // DW_FORM_addr
853
+ .b8 18 // DW_AT_high_pc
854
+ .b8 1 // DW_FORM_addr
855
+ .b8 49 // DW_AT_abstract_origin
856
+ .b8 19 // DW_FORM_ref4
857
+ .b8 0 // EOM(1)
858
+ .b8 0 // EOM(2)
859
+ .b8 4 // Abbreviation Code
860
+ .b8 29 // DW_TAG_inlined_subroutine
861
+ .b8 0 // DW_CHILDREN_no
862
+ .b8 49 // DW_AT_abstract_origin
863
+ .b8 19 // DW_FORM_ref4
864
+ .b8 17 // DW_AT_low_pc
865
+ .b8 1 // DW_FORM_addr
866
+ .b8 18 // DW_AT_high_pc
867
+ .b8 1 // DW_FORM_addr
868
+ .b8 88 // DW_AT_call_file
869
+ .b8 11 // DW_FORM_data1
870
+ .b8 89 // DW_AT_call_line
871
+ .b8 11 // DW_FORM_data1
872
+ .b8 87 // DW_AT_call_column
873
+ .b8 11 // DW_FORM_data1
874
+ .b8 0 // EOM(1)
875
+ .b8 0 // EOM(2)
876
+ .b8 0 // EOM(3)
877
+ }
878
+ .section .debug_info
879
+ {
880
+ .b32 235 // Length of Unit
881
+ .b8 2 // DWARF version number
882
+ .b8 0
883
+ .b32 .debug_abbrev // Offset Into Abbrev. Section
884
+ .b8 8 // Address Size (in bytes)
885
+ .b8 1 // Abbrev [1] 0xb:0xe4 DW_TAG_compile_unit
886
+ .b8 116 // DW_AT_producer
887
+ .b8 114
888
+ .b8 105
889
+ .b8 116
890
+ .b8 111
891
+ .b8 110
892
+ .b8 0
893
+ .b8 2 // DW_AT_language
894
+ .b8 0
895
+ .b8 99 // DW_AT_name
896
+ .b8 114
897
+ .b8 53
898
+ .b8 103
899
+ .b8 105
900
+ .b8 109
901
+ .b8 114
902
+ .b8 116
903
+ .b8 113
904
+ .b8 102
905
+ .b8 53
906
+ .b8 103
907
+ .b8 113
908
+ .b8 109
909
+ .b8 103
910
+ .b8 107
911
+ .b8 103
912
+ .b8 105
913
+ .b8 114
914
+ .b8 122
915
+ .b8 54
916
+ .b8 110
917
+ .b8 101
918
+ .b8 98
919
+ .b8 112
920
+ .b8 52
921
+ .b8 105
922
+ .b8 106
923
+ .b8 119
924
+ .b8 116
925
+ .b8 98
926
+ .b8 122
927
+ .b8 104
928
+ .b8 109
929
+ .b8 113
930
+ .b8 108
931
+ .b8 115
932
+ .b8 100
933
+ .b8 103
934
+ .b8 106
935
+ .b8 113
936
+ .b8 100
937
+ .b8 97
938
+ .b8 51
939
+ .b8 51
940
+ .b8 100
941
+ .b8 114
942
+ .b8 52
943
+ .b8 51
944
+ .b8 113
945
+ .b8 114
946
+ .b8 101
947
+ .b8 46
948
+ .b8 112
949
+ .b8 121
950
+ .b8 0
951
+ .b32 .debug_line // DW_AT_stmt_list
952
+ .b8 47 // DW_AT_comp_dir
953
+ .b8 119
954
+ .b8 111
955
+ .b8 114
956
+ .b8 107
957
+ .b8 115
958
+ .b8 112
959
+ .b8 97
960
+ .b8 99
961
+ .b8 101
962
+ .b8 47
963
+ .b8 104
964
+ .b8 97
965
+ .b8 110
966
+ .b8 114
967
+ .b8 117
968
+ .b8 105
969
+ .b8 47
970
+ .b8 106
971
+ .b8 117
972
+ .b8 110
973
+ .b8 113
974
+ .b8 117
975
+ .b8 97
976
+ .b8 110
977
+ .b8 47
978
+ .b8 83
979
+ .b8 112
980
+ .b8 101
981
+ .b8 99
982
+ .b8 70
983
+ .b8 111
984
+ .b8 114
985
+ .b8 103
986
+ .b8 101
987
+ .b8 47
988
+ .b8 99
989
+ .b8 97
990
+ .b8 99
991
+ .b8 104
992
+ .b8 101
993
+ .b8 47
994
+ .b8 99
995
+ .b8 111
996
+ .b8 109
997
+ .b8 112
998
+ .b8 105
999
+ .b8 108
1000
+ .b8 101
1001
+ .b8 100
1002
+ .b8 95
1003
+ .b8 107
1004
+ .b8 101
1005
+ .b8 114
1006
+ .b8 110
1007
+ .b8 101
1008
+ .b8 108
1009
+ .b8 115
1010
+ .b8 47
1011
+ .b8 114
1012
+ .b8 53
1013
+ .b8 0
1014
+ .b8 2 // Abbrev [2] 0x8f:0x19 DW_TAG_subprogram
1015
+ .b8 116 // DW_AT_name
1016
+ .b8 114
1017
+ .b8 105
1018
+ .b8 116
1019
+ .b8 111
1020
+ .b8 110
1021
+ .b8 95
1022
+ .b8 112
1023
+ .b8 101
1024
+ .b8 114
1025
+ .b8 95
1026
+ .b8 102
1027
+ .b8 117
1028
+ .b8 115
1029
+ .b8 101
1030
+ .b8 100
1031
+ .b8 95
1032
+ .b8 109
1033
+ .b8 117
1034
+ .b8 108
1035
+ .b8 95
1036
+ .b8 49
1037
+ .b8 0
1038
+ .b8 1 // DW_AT_inline
1039
+ .b8 3 // Abbrev [3] 0xa8:0x46 DW_TAG_subprogram
1040
+ .b64 $L__func_begin0 // DW_AT_low_pc
1041
+ .b64 $L__func_end0 // DW_AT_high_pc
1042
+ .b32 143 // DW_AT_abstract_origin
1043
+ .b8 4 // Abbrev [4] 0xbd:0x18 DW_TAG_inlined_subroutine
1044
+ .b32 143 // DW_AT_abstract_origin
1045
+ .b64 $L__tmp1 // DW_AT_low_pc
1046
+ .b64 $L__tmp2 // DW_AT_high_pc
1047
+ .b8 1 // DW_AT_call_file
1048
+ .b8 38 // DW_AT_call_line
1049
+ .b8 37 // DW_AT_call_column
1050
+ .b8 4 // Abbrev [4] 0xd5:0x18 DW_TAG_inlined_subroutine
1051
+ .b32 143 // DW_AT_abstract_origin
1052
+ .b64 $L__tmp3 // DW_AT_low_pc
1053
+ .b64 $L__tmp4 // DW_AT_high_pc
1054
+ .b8 1 // DW_AT_call_file
1055
+ .b8 48 // DW_AT_call_line
1056
+ .b8 26 // DW_AT_call_column
1057
+ .b8 0 // End Of Children Mark
1058
+ .b8 0 // End Of Children Mark
1059
+ }
1060
+ .section .debug_macinfo { }
progress/SpecForge/cache/compiled_kernels/triton/3/2GPX4H2RCLQEV2LIJWPK5V4BGHUBOBJI3FUMPIZKQJ7CMB34OTAA/triton_per_fused_mul_1.source ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":18:0)
2
+ #loc47 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":122:0)
3
+ #loc49 = loc(unknown)
4
+ #loc52 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":109:0)
5
+ #loc61 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0)
6
+ #loc65 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0)
7
+ #loc74 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0)
8
+ #loc78 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0)
9
+ #loc82 = loc("in_ptr0"(#loc))
10
+ #loc83 = loc("in_ptr1"(#loc))
11
+ #loc84 = loc("out_ptr0"(#loc))
12
+ #loc85 = loc("out_ptr1"(#loc))
13
+ #loc86 = loc("out_ptr2"(#loc))
14
+ #loc87 = loc("xnumel"(#loc))
15
+ #loc88 = loc("r0_numel"(#loc))
16
+ #loc128 = loc("a"(#loc47))
17
+ #loc129 = loc("a"(#loc52))
18
+ #loc130 = loc("b"(#loc52))
19
+ #loc134 = loc("x"(#loc61))
20
+ #loc135 = loc("x"(#loc65))
21
+ #loc136 = loc("input"(#loc74))
22
+ #loc137 = loc("a"(#loc78))
23
+ #loc138 = loc("b"(#loc78))
24
+ module {
25
+ tt.func public @triton_per_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
26
+ %xnumel_0 = arith.constant 1536 : i32 loc(#loc89)
27
+ %r0_numel_1 = arith.constant 32 : i32 loc(#loc90)
28
+ %xoffset = tt.get_program_id x : i32 loc(#loc91)
29
+ %xoffset_2 = arith.constant 128 : i32 loc(#loc92)
30
+ %xoffset_3 = arith.constant 128 : i32 loc(#loc92)
31
+ %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc92)
32
+ %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc93)
33
+ %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc94)
34
+ %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<128x1xi32> loc(#loc95)
35
+ %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<128x1xi32> loc(#loc95)
36
+ %xmask = arith.constant dense<1536> : tensor<128x1xi32> loc(#loc96)
37
+ %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<128x1xi32> loc(#loc96)
38
+ %r0_index = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc97)
39
+ %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc98)
40
+ %r0_offset = arith.constant 0 : i32 loc(#loc99)
41
+ %r0_mask = arith.constant true loc(#loc100)
42
+ %r0_mask_10 = arith.constant dense<true> : tensor<128x32xi1> loc(#loc100)
43
+ %tmp0 = arith.constant 1536 : i32 loc(#loc101)
44
+ %tmp0_11 = arith.constant 1536 : i32 loc(#loc101)
45
+ %tmp0_12 = arith.constant dense<1536> : tensor<1x32xi32> loc(#loc101)
46
+ %tmp0_13 = arith.muli %tmp0_12, %r0_index_9 : tensor<1x32xi32> loc(#loc101)
47
+ %tmp0_14 = tt.broadcast %xindex_7 : tensor<128x1xi32> -> tensor<128x32xi32> loc(#loc102)
48
+ %tmp0_15 = tt.broadcast %tmp0_13 : tensor<1x32xi32> -> tensor<128x32xi32> loc(#loc102)
49
+ %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<128x32xi32> loc(#loc102)
50
+ %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<128x32x!tt.ptr<f32>> loc(#loc103)
51
+ %tmp0_18 = tt.addptr %tmp0_17, %tmp0_16 : tensor<128x32x!tt.ptr<f32>>, tensor<128x32xi32> loc(#loc103)
52
+ %tmp0_19 = arith.constant 0.000000e+00 : f32 loc(#loc104)
53
+ %tmp0_20 = tt.broadcast %xmask_8 : tensor<128x1xi1> -> tensor<128x32xi1> loc(#loc104)
54
+ %tmp0_21 = arith.constant dense<0.000000e+00> : tensor<128x32xf32> loc(#loc104)
55
+ %tmp0_22 = tt.load %tmp0_18, %tmp0_20, %tmp0_21 : tensor<128x32x!tt.ptr<f32>> loc(#loc104)
56
+ %tmp5 = arith.constant 1536 : i32 loc(#loc105)
57
+ %tmp5_23 = arith.constant 1536 : i32 loc(#loc105)
58
+ %tmp5_24 = arith.constant dense<1536> : tensor<1x32xi32> loc(#loc105)
59
+ %tmp5_25 = arith.muli %tmp5_24, %r0_index_9 : tensor<1x32xi32> loc(#loc105)
60
+ %tmp5_26 = tt.broadcast %xindex_7 : tensor<128x1xi32> -> tensor<128x32xi32> loc(#loc106)
61
+ %tmp5_27 = tt.broadcast %tmp5_25 : tensor<1x32xi32> -> tensor<128x32xi32> loc(#loc106)
62
+ %tmp5_28 = arith.addi %tmp5_26, %tmp5_27 : tensor<128x32xi32> loc(#loc106)
63
+ %tmp5_29 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<128x32x!tt.ptr<f32>> loc(#loc107)
64
+ %tmp5_30 = tt.addptr %tmp5_29, %tmp5_28 : tensor<128x32x!tt.ptr<f32>>, tensor<128x32xi32> loc(#loc107)
65
+ %tmp5_31 = arith.constant 0.000000e+00 : f32 loc(#loc108)
66
+ %tmp5_32 = tt.broadcast %xmask_8 : tensor<128x1xi1> -> tensor<128x32xi1> loc(#loc108)
67
+ %tmp5_33 = arith.constant dense<0.000000e+00> : tensor<128x32xf32> loc(#loc108)
68
+ %tmp5_34 = tt.load %tmp5_30, %tmp5_32, %tmp5_33 : tensor<128x32x!tt.ptr<f32>> loc(#loc108)
69
+ %tmp3 = arith.constant 0xFF800000 : f32 loc(#loc109)
70
+ %tmp3_35 = arith.constant 0xFF800000 : f32 loc(#loc109)
71
+ %tmp3_36 = arith.constant dense<0xFF800000> : tensor<128x32xf32> loc(#loc109)
72
+ %tmp3_37 = tt.broadcast %xmask_8 : tensor<128x1xi1> -> tensor<128x32xi1> loc(#loc109)
73
+ %tmp3_38 = arith.select %tmp3_37, %tmp0_22, %tmp3_36 : tensor<128x32xi1>, tensor<128x32xf32> loc(#loc109)
74
+ %tmp4 = tt.call @"torch._inductor.runtime.triton_helpers.max2__fp32S128_32S__(1,)cconstexpr_1_"(%tmp3_38) : (tensor<128x32xf32>) -> tensor<128xf32> loc(#loc110)
75
+ %tmp4_39 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc111)
76
+ %tmp6 = arith.constant 0xFF800000 : f32 loc(#loc112)
77
+ %tmp7 = arith.constant dense<0xFF800000> : tensor<128x1xf32> loc(#loc113)
78
+ %tmp7_40 = arith.cmpf oeq, %tmp4_39, %tmp7 : tensor<128x1xf32> loc(#loc113)
79
+ %tmp8 = tt.broadcast %tmp4_39 : tensor<128x1xf32> -> tensor<128x32xf32> loc(#loc114)
80
+ %tmp8_41 = arith.subf %tmp0_22, %tmp8 : tensor<128x32xf32> loc(#loc114)
81
+ %tmp9 = arith.constant 0.000000e+00 : f32 loc(#loc115)
82
+ %tmp10 = arith.constant dense<0.000000e+00> : tensor<128x32xf32> loc(#loc116)
83
+ %tmp10_42 = tt.broadcast %tmp7_40 : tensor<128x1xi1> -> tensor<128x32xi1> loc(#loc116)
84
+ %tmp10_43 = arith.select %tmp10_42, %tmp10, %tmp8_41 : tensor<128x32xi1>, tensor<128x32xf32> loc(#loc116)
85
+ %tmp11 = tt.extern_elementwise %tmp10_43 {libname = "", libpath = "", pure = true, symbol = "__nv_exp2f"} : (tensor<128x32xf32>) -> tensor<128x32xf32> loc(#loc117)
86
+ %tmp12 = arith.mulf %tmp5_34, %tmp11 : tensor<128x32xf32> loc(#loc118)
87
+ %tmp15 = arith.constant 0 : i32 loc(#loc119)
88
+ %tmp15_44 = arith.constant 0.000000e+00 : f32 loc(#loc119)
89
+ %tmp15_45 = arith.constant dense<0.000000e+00> : tensor<128x32xf32> loc(#loc119)
90
+ %tmp15_46 = tt.broadcast %xmask_8 : tensor<128x1xi1> -> tensor<128x32xi1> loc(#loc119)
91
+ %tmp15_47 = arith.select %tmp15_46, %tmp12, %tmp15_45 : tensor<128x32xi1>, tensor<128x32xf32> loc(#loc119)
92
+ %tmp16 = tt.call @"triton.language.standard.sum__fp32S128_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp15_47) : (tensor<128x32xf32>) -> tensor<128xf32> loc(#loc120)
93
+ %tmp16_48 = tt.expand_dims %tmp16 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc121)
94
+ %tmp17 = arith.constant 1.000000e+00 : f32 loc(#loc122)
95
+ %tmp18 = arith.constant dense<1.000000e+00> : tensor<128x1xf32> loc(#loc123)
96
+ %tmp18_49 = arith.select %tmp7_40, %tmp18, %tmp16_48 : tensor<128x1xi1>, tensor<128x1xf32> loc(#loc123)
97
+ %tmp19 = tt.extern_elementwise %tmp18_49 {libname = "", libpath = "", pure = true, symbol = "__nv_log2f"} : (tensor<128x1xf32>) -> tensor<128x1xf32> loc(#loc124)
98
+ %tmp20 = arith.addf %tmp19, %tmp4_39 : tensor<128x1xf32> loc(#loc125)
99
+ %tmp21 = arith.constant 0.693147182 : f32 loc(#loc126)
100
+ %tmp22 = arith.constant dense<0.693147182> : tensor<128x1xf32> loc(#loc127)
101
+ %tmp22_50 = arith.mulf %tmp20, %tmp22 : tensor<128x1xf32> loc(#loc127)
102
+ %0 = tt.splat %out_ptr2 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>> loc(#loc40)
103
+ %1 = tt.addptr %0, %xindex_7 : tensor<128x1x!tt.ptr<f32>>, tensor<128x1xi32> loc(#loc40)
104
+ tt.store %1, %tmp22_50, %xmask_8 : tensor<128x1x!tt.ptr<f32>> loc(#loc41)
105
+ %2 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>> loc(#loc42)
106
+ %3 = tt.addptr %2, %xindex_7 : tensor<128x1x!tt.ptr<f32>>, tensor<128x1xi32> loc(#loc42)
107
+ tt.store %3, %tmp4_39, %xmask_8 : tensor<128x1x!tt.ptr<f32>> loc(#loc43)
108
+ %4 = tt.splat %out_ptr1 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>> loc(#loc44)
109
+ %5 = tt.addptr %4, %xindex_7 : tensor<128x1x!tt.ptr<f32>>, tensor<128x1xi32> loc(#loc44)
110
+ tt.store %5, %tmp16_48, %xmask_8 : tensor<128x1x!tt.ptr<f32>> loc(#loc45)
111
+ tt.return loc(#loc46)
112
+ } loc(#loc)
113
+ tt.func private @"torch._inductor.runtime.triton_helpers.max2__fp32S128_32S__(1,)cconstexpr_1_"(%a: tensor<128x32xf32> loc("a"(#loc47))) -> tensor<128xf32> attributes {noinline = false} {
114
+ %0 = "tt.reduce"(%a) <{axis = 1 : i32}> ({
115
+ ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
116
+ %2 = tt.call @torch._inductor.runtime.triton_helpers.maximum__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc48)
117
+ tt.reduce.return %2 : f32 loc(#loc48)
118
+ }) : (tensor<128x32xf32>) -> tensor<128xf32> loc(#loc48)
119
+ tt.return %0 : tensor<128xf32> loc(#loc50)
120
+ ^bb1: // no predecessors
121
+ %1 = ub.poison : tensor<128xf32> loc(#loc51)
122
+ tt.return %1 : tensor<128xf32> loc(#loc51)
123
+ } loc(#loc47)
124
+ tt.func private @torch._inductor.runtime.triton_helpers.maximum__fp32_fp32__(%a: f32 loc("a"(#loc52)), %b: f32 loc("b"(#loc52))) -> f32 attributes {noinline = false} {
125
+ %mask = arith.cmpf ogt, %a, %b : f32 loc(#loc139)
126
+ %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a) : (f32) -> i1 loc(#loc54)
127
+ %1 = scf.if %0 -> (i1) {
128
+ %mask_0 = arith.cmpf une, %a, %a : f32 loc(#loc132)
129
+ %mask_1 = arith.ori %mask, %mask_0 : i1 loc(#loc140)
130
+ scf.yield %mask_1 : i1 loc(#loc140)
131
+ } else {
132
+ scf.yield %mask : i1 loc(#loc49)
133
+ } loc(#loc55)
134
+ %2 = arith.select %1, %a, %b : f32 loc(#loc58)
135
+ tt.return %2 : f32 loc(#loc59)
136
+ ^bb1: // no predecessors
137
+ %3 = ub.poison : f32 loc(#loc60)
138
+ tt.return %3 : f32 loc(#loc60)
139
+ } loc(#loc52)
140
+ tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc61))) -> i1 attributes {noinline = false} {
141
+ %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc62)
142
+ %true = arith.constant true loc(#loc63)
143
+ tt.return %true : i1 loc(#loc63)
144
+ ^bb1: // no predecessors
145
+ %1 = ub.poison : i1 loc(#loc64)
146
+ tt.return %1 : i1 loc(#loc64)
147
+ } loc(#loc61)
148
+ tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc65))) -> tensor<1xf32> attributes {noinline = false} {
149
+ %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc66)
150
+ %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc67)
151
+ %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc67)
152
+ %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc67)
153
+ tt.return %3 : tensor<1xf32> loc(#loc68)
154
+ ^bb1: // no predecessors
155
+ %4 = ub.poison : tensor<1xf32> loc(#loc69)
156
+ tt.return %4 : tensor<1xf32> loc(#loc69)
157
+ } loc(#loc65)
158
+ tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} {
159
+ %false = arith.constant false loc(#loc71)
160
+ %cst = arith.constant dense<false> : tensor<1xi1> loc(#loc71)
161
+ tt.return %cst : tensor<1xi1> loc(#loc72)
162
+ ^bb1: // no predecessors
163
+ %0 = ub.poison : tensor<1xi1> loc(#loc73)
164
+ tt.return %0 : tensor<1xi1> loc(#loc73)
165
+ } loc(#loc70)
166
+ tt.func private @"triton.language.standard.sum__fp32S128_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x32xf32> loc("input"(#loc74))) -> tensor<128xf32> attributes {noinline = false} {
167
+ %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
168
+ ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
169
+ %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc75)
170
+ tt.reduce.return %2 : f32 loc(#loc75)
171
+ }) : (tensor<128x32xf32>) -> tensor<128xf32> loc(#loc75)
172
+ tt.return %0 : tensor<128xf32> loc(#loc76)
173
+ ^bb1: // no predecessors
174
+ %1 = ub.poison : tensor<128xf32> loc(#loc77)
175
+ tt.return %1 : tensor<128xf32> loc(#loc77)
176
+ } loc(#loc74)
177
+ tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc78)), %b: f32 loc("b"(#loc78))) -> f32 attributes {noinline = false} {
178
+ %0 = arith.addf %a, %b : f32 loc(#loc79)
179
+ tt.return %0 : f32 loc(#loc80)
180
+ ^bb1: // no predecessors
181
+ %1 = ub.poison : f32 loc(#loc81)
182
+ tt.return %1 : f32 loc(#loc81)
183
+ } loc(#loc78)
184
+ } loc(#loc)
185
+ #loc1 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":19:13)
186
+ #loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":20:15)
187
+ #loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":24:28)
188
+ #loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":24:33)
189
+ #loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":25:36)
190
+ #loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":25:44)
191
+ #loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":25:23)
192
+ #loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":26:21)
193
+ #loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":27:28)
194
+ #loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":27:38)
195
+ #loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":28:16)
196
+ #loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":29:48)
197
+ #loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":34:40)
198
+ #loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":34:35)
199
+ #loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":34:30)
200
+ #loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":34:47)
201
+ #loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":35:40)
202
+ #loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":35:35)
203
+ #loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":35:30)
204
+ #loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":35:47)
205
+ #loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":37:33)
206
+ #loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":38:37)
207
+ #loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":38:40)
208
+ #loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":39:11)
209
+ #loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":40:19)
210
+ #loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":41:18)
211
+ #loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":42:11)
212
+ #loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":43:33)
213
+ #loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":44:27)
214
+ #loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":45:19)
215
+ #loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":47:35)
216
+ #loc32 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":48:26)
217
+ #loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":48:29)
218
+ #loc34 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":49:12)
219
+ #loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":50:34)
220
+ #loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":51:27)
221
+ #loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":52:20)
222
+ #loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":53:12)
223
+ #loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":54:20)
224
+ #loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":55:25)
225
+ #loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":55:37)
226
+ #loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":56:25)
227
+ #loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":56:36)
228
+ #loc44 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":57:25)
229
+ #loc45 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":57:37)
230
+ #loc46 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":57:4)
231
+ #loc48 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29)
232
+ #loc50 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:11)
233
+ #loc51 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:4)
234
+ #loc53 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15)
235
+ #loc54 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:19)
236
+ #loc55 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:7)
237
+ #loc56 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21)
238
+ #loc57 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16)
239
+ #loc58 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29)
240
+ #loc59 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:11)
241
+ #loc60 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:4)
242
+ #loc62 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29)
243
+ #loc63 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11)
244
+ #loc64 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4)
245
+ #loc66 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30)
246
+ #loc67 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15)
247
+ #loc68 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11)
248
+ #loc69 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4)
249
+ #loc70 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0)
250
+ #loc71 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31)
251
+ #loc72 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11)
252
+ #loc73 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4)
253
+ #loc75 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
254
+ #loc76 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11)
255
+ #loc77 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4)
256
+ #loc79 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
257
+ #loc80 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11)
258
+ #loc81 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4)
259
+ #loc89 = loc("xnumel"(#loc1))
260
+ #loc90 = loc("r0_numel"(#loc2))
261
+ #loc91 = loc("xoffset"(#loc3))
262
+ #loc92 = loc("xoffset"(#loc4))
263
+ #loc93 = loc("xindex"(#loc5))
264
+ #loc94 = loc("xindex"(#loc6))
265
+ #loc95 = loc("xindex"(#loc7))
266
+ #loc96 = loc("xmask"(#loc8))
267
+ #loc97 = loc("r0_index"(#loc9))
268
+ #loc98 = loc("r0_index"(#loc10))
269
+ #loc99 = loc("r0_offset"(#loc11))
270
+ #loc100 = loc("r0_mask"(#loc12))
271
+ #loc101 = loc("tmp0"(#loc13))
272
+ #loc102 = loc("tmp0"(#loc14))
273
+ #loc103 = loc("tmp0"(#loc15))
274
+ #loc104 = loc("tmp0"(#loc16))
275
+ #loc105 = loc("tmp5"(#loc17))
276
+ #loc106 = loc("tmp5"(#loc18))
277
+ #loc107 = loc("tmp5"(#loc19))
278
+ #loc108 = loc("tmp5"(#loc20))
279
+ #loc109 = loc("tmp3"(#loc21))
280
+ #loc110 = loc("tmp4"(#loc22))
281
+ #loc111 = loc("tmp4"(#loc23))
282
+ #loc112 = loc("tmp6"(#loc24))
283
+ #loc113 = loc("tmp7"(#loc25))
284
+ #loc114 = loc("tmp8"(#loc26))
285
+ #loc115 = loc("tmp9"(#loc27))
286
+ #loc116 = loc("tmp10"(#loc28))
287
+ #loc117 = loc("tmp11"(#loc29))
288
+ #loc118 = loc("tmp12"(#loc30))
289
+ #loc119 = loc("tmp15"(#loc31))
290
+ #loc120 = loc("tmp16"(#loc32))
291
+ #loc121 = loc("tmp16"(#loc33))
292
+ #loc122 = loc("tmp17"(#loc34))
293
+ #loc123 = loc("tmp18"(#loc35))
294
+ #loc124 = loc("tmp19"(#loc36))
295
+ #loc125 = loc("tmp20"(#loc37))
296
+ #loc126 = loc("tmp21"(#loc38))
297
+ #loc127 = loc("tmp22"(#loc39))
298
+ #loc131 = loc("mask"(#loc53))
299
+ #loc132 = loc("mask"(#loc56))
300
+ #loc133 = loc("mask"(#loc57))
301
+ #loc139 = loc("mask"(#loc131))
302
+ #loc140 = loc("mask"(#loc133))
progress/SpecForge/cache/compiled_kernels/triton/3/2GPX4H2RCLQEV2LIJWPK5V4BGHUBOBJI3FUMPIZKQJ7CMB34OTAA/triton_per_fused_mul_1.ttgir ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 8], order = [0, 1]}>
2
+ #blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 2], order = [0, 1]}>
3
+ #loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":18:0)
4
+ #loc1 = loc(unknown)
5
+ #loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":38:37)
6
+ #loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":48:26)
7
+ #loc43 = loc("in_ptr0"(#loc))
8
+ #loc44 = loc("in_ptr1"(#loc))
9
+ #loc45 = loc("out_ptr0"(#loc))
10
+ #loc46 = loc("out_ptr1"(#loc))
11
+ #loc47 = loc("out_ptr2"(#loc))
12
+ #loc48 = loc("xnumel"(#loc))
13
+ #loc49 = loc("r0_numel"(#loc))
14
+ #loc63 = loc("tmp4"(#loc16))
15
+ #loc74 = loc("tmp16"(#loc29))
16
+ #loc81 = loc(callsite(#loc1 at #loc63))
17
+ #loc85 = loc(callsite(#loc1 at #loc74))
18
+ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
19
+ tt.func public @triton_per_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
20
+ %cst = arith.constant dense<1536> : tensor<1x32xi32, #blocked> loc(#loc1)
21
+ %cst_0 = arith.constant dense<1536> : tensor<128x1xi32, #blocked1> loc(#loc1)
22
+ %cst_1 = arith.constant dense<1536> : tensor<128x1xi32, #blocked> loc(#loc1)
23
+ %c128_i32 = arith.constant 128 : i32 loc(#loc1)
24
+ %cst_2 = arith.constant dense<0.693147182> : tensor<128x1xf32, #blocked> loc(#loc1)
25
+ %cst_3 = arith.constant dense<1.000000e+00> : tensor<128x1xf32, #blocked> loc(#loc1)
26
+ %cst_4 = arith.constant dense<0xFF800000> : tensor<128x1xf32, #blocked> loc(#loc1)
27
+ %cst_5 = arith.constant dense<0xFF800000> : tensor<128x32xf32, #blocked> loc(#loc1)
28
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<128x32xf32, #blocked> loc(#loc1)
29
+ %xoffset = tt.get_program_id x : i32 loc(#loc50)
30
+ %xoffset_7 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc51)
31
+ %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc52)
32
+ %xindex_8 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc52)
33
+ %xindex_9 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc52)
34
+ %xindex_10 = tt.expand_dims %xindex_8 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1xi32, #blocked1> loc(#loc52)
35
+ %xindex_11 = tt.splat %xoffset_7 : i32 -> tensor<128x1xi32, #blocked> loc(#loc53)
36
+ %xindex_12 = tt.splat %xoffset_7 : i32 -> tensor<128x1xi32, #blocked1> loc(#loc53)
37
+ %xindex_13 = arith.addi %xindex_11, %xindex_9 : tensor<128x1xi32, #blocked> loc(#loc53)
38
+ %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<128x1xi32, #blocked1> loc(#loc53)
39
+ %xmask = arith.cmpi slt, %xindex_13, %cst_1 : tensor<128x1xi32, #blocked> loc(#loc54)
40
+ %xmask_15 = arith.cmpi slt, %xindex_14, %cst_0 : tensor<128x1xi32, #blocked1> loc(#loc54)
41
+ %r0_index = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc55)
42
+ %r0_index_16 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> loc(#loc55)
43
+ %tmp0 = arith.muli %r0_index_16, %cst : tensor<1x32xi32, #blocked> loc(#loc56)
44
+ %tmp0_17 = tt.broadcast %xindex_13 : tensor<128x1xi32, #blocked> -> tensor<128x32xi32, #blocked> loc(#loc57)
45
+ %tmp0_18 = tt.broadcast %tmp0 : tensor<1x32xi32, #blocked> -> tensor<128x32xi32, #blocked> loc(#loc57)
46
+ %tmp0_19 = arith.addi %tmp0_17, %tmp0_18 : tensor<128x32xi32, #blocked> loc(#loc57)
47
+ %tmp0_20 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<128x32x!tt.ptr<f32>, #blocked> loc(#loc58)
48
+ %tmp0_21 = tt.addptr %tmp0_20, %tmp0_19 : tensor<128x32x!tt.ptr<f32>, #blocked>, tensor<128x32xi32, #blocked> loc(#loc58)
49
+ %tmp0_22 = tt.broadcast %xmask : tensor<128x1xi1, #blocked> -> tensor<128x32xi1, #blocked> loc(#loc59)
50
+ %tmp0_23 = tt.load %tmp0_21, %tmp0_22, %cst_6 : tensor<128x32x!tt.ptr<f32>, #blocked> loc(#loc59)
51
+ %tmp5 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<128x32x!tt.ptr<f32>, #blocked> loc(#loc60)
52
+ %tmp5_24 = tt.addptr %tmp5, %tmp0_19 : tensor<128x32x!tt.ptr<f32>, #blocked>, tensor<128x32xi32, #blocked> loc(#loc60)
53
+ %tmp5_25 = tt.load %tmp5_24, %tmp0_22, %cst_6 : tensor<128x32x!tt.ptr<f32>, #blocked> loc(#loc61)
54
+ %tmp3 = arith.select %tmp0_22, %tmp0_23, %cst_5 : tensor<128x32xi1, #blocked>, tensor<128x32xf32, #blocked> loc(#loc62)
55
+ %tmp4 = "tt.reduce"(%tmp3) <{axis = 1 : i32}> ({
56
+ ^bb0(%tmp4_34: f32 loc(callsite(#loc1 at #loc63)), %tmp4_35: f32 loc(callsite(#loc1 at #loc63))):
57
+ %mask = arith.cmpf ogt, %tmp4_34, %tmp4_35 : f32 loc(#loc86)
58
+ %mask_36 = arith.cmpf une, %tmp4_34, %tmp4_34 : f32 loc(#loc87)
59
+ %mask_37 = arith.ori %mask, %mask_36 : i1 loc(#loc88)
60
+ %tmp4_38 = arith.select %mask_37, %tmp4_34, %tmp4_35 : f32 loc(#loc89)
61
+ tt.reduce.return %tmp4_38 : f32 loc(#loc80)
62
+ }) : (tensor<128x32xf32, #blocked>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc80)
63
+ %tmp4_26 = ttg.convert_layout %tmp4 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc67)
64
+ %tmp4_27 = tt.expand_dims %tmp4_26 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1xf32, #blocked1> loc(#loc67)
65
+ %tmp4_28 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xf32, #blocked> loc(#loc67)
66
+ %tmp7 = arith.cmpf oeq, %tmp4_28, %cst_4 : tensor<128x1xf32, #blocked> loc(#loc68)
67
+ %tmp8 = tt.broadcast %tmp4_28 : tensor<128x1xf32, #blocked> -> tensor<128x32xf32, #blocked> loc(#loc69)
68
+ %tmp8_29 = arith.subf %tmp0_23, %tmp8 : tensor<128x32xf32, #blocked> loc(#loc69)
69
+ %tmp10 = tt.broadcast %tmp7 : tensor<128x1xi1, #blocked> -> tensor<128x32xi1, #blocked> loc(#loc70)
70
+ %tmp10_30 = arith.select %tmp10, %cst_6, %tmp8_29 : tensor<128x32xi1, #blocked>, tensor<128x32xf32, #blocked> loc(#loc70)
71
+ %tmp11 = tt.extern_elementwise %tmp10_30 {libname = "", libpath = "", pure = true, symbol = "__nv_exp2f"} : (tensor<128x32xf32, #blocked>) -> tensor<128x32xf32, #blocked> loc(#loc71)
72
+ %tmp12 = arith.mulf %tmp5_25, %tmp11 : tensor<128x32xf32, #blocked> loc(#loc72)
73
+ %tmp15 = arith.select %tmp0_22, %tmp12, %cst_6 : tensor<128x32xi1, #blocked>, tensor<128x32xf32, #blocked> loc(#loc73)
74
+ %tmp16 = "tt.reduce"(%tmp15) <{axis = 1 : i32}> ({
75
+ ^bb0(%tmp16_34: f32 loc(callsite(#loc1 at #loc74)), %tmp16_35: f32 loc(callsite(#loc1 at #loc74))):
76
+ %tmp16_36 = arith.addf %tmp16_34, %tmp16_35 : f32 loc(#loc90)
77
+ tt.reduce.return %tmp16_36 : f32 loc(#loc84)
78
+ }) : (tensor<128x32xf32, #blocked>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc84)
79
+ %tmp16_31 = ttg.convert_layout %tmp16 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc75)
80
+ %tmp16_32 = tt.expand_dims %tmp16_31 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1xf32, #blocked1> loc(#loc75)
81
+ %tmp16_33 = tt.expand_dims %tmp16 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xf32, #blocked> loc(#loc75)
82
+ %tmp18 = arith.select %tmp7, %cst_3, %tmp16_33 : tensor<128x1xi1, #blocked>, tensor<128x1xf32, #blocked> loc(#loc76)
83
+ %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_log2f"} : (tensor<128x1xf32, #blocked>) -> tensor<128x1xf32, #blocked> loc(#loc77)
84
+ %tmp20 = arith.addf %tmp19, %tmp4_28 : tensor<128x1xf32, #blocked> loc(#loc78)
85
+ %tmp22 = arith.mulf %tmp20, %cst_2 : tensor<128x1xf32, #blocked> loc(#loc79)
86
+ %0 = tt.splat %out_ptr2 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>, #blocked1> loc(#loc36)
87
+ %1 = tt.addptr %0, %xindex_14 : tensor<128x1x!tt.ptr<f32>, #blocked1>, tensor<128x1xi32, #blocked1> loc(#loc36)
88
+ %2 = ttg.convert_layout %tmp22 : tensor<128x1xf32, #blocked> -> tensor<128x1xf32, #blocked1> loc(#loc37)
89
+ tt.store %1, %2, %xmask_15 : tensor<128x1x!tt.ptr<f32>, #blocked1> loc(#loc37)
90
+ %3 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>, #blocked1> loc(#loc38)
91
+ %4 = tt.addptr %3, %xindex_14 : tensor<128x1x!tt.ptr<f32>, #blocked1>, tensor<128x1xi32, #blocked1> loc(#loc38)
92
+ tt.store %4, %tmp4_27, %xmask_15 : tensor<128x1x!tt.ptr<f32>, #blocked1> loc(#loc39)
93
+ %5 = tt.splat %out_ptr1 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>, #blocked1> loc(#loc40)
94
+ %6 = tt.addptr %5, %xindex_14 : tensor<128x1x!tt.ptr<f32>, #blocked1>, tensor<128x1xi32, #blocked1> loc(#loc40)
95
+ tt.store %6, %tmp16_32, %xmask_15 : tensor<128x1x!tt.ptr<f32>, #blocked1> loc(#loc41)
96
+ tt.return loc(#loc42)
97
+ } loc(#loc)
98
+ } loc(#loc)
99
+ #loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":24:28)
100
+ #loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":24:33)
101
+ #loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":25:44)
102
+ #loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":25:23)
103
+ #loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":26:21)
104
+ #loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":27:38)
105
+ #loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":34:40)
106
+ #loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":34:35)
107
+ #loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":34:30)
108
+ #loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":34:47)
109
+ #loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":35:30)
110
+ #loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":35:47)
111
+ #loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":37:33)
112
+ #loc15 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29)
113
+ #loc17 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15)
114
+ #loc18 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21)
115
+ #loc19 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16)
116
+ #loc20 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29)
117
+ #loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":38:40)
118
+ #loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":40:19)
119
+ #loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":41:18)
120
+ #loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":43:33)
121
+ #loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":44:27)
122
+ #loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":45:19)
123
+ #loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":47:35)
124
+ #loc28 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
125
+ #loc30 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
126
+ #loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":48:29)
127
+ #loc32 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":50:34)
128
+ #loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":51:27)
129
+ #loc34 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":52:20)
130
+ #loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":54:20)
131
+ #loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":55:25)
132
+ #loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":55:37)
133
+ #loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":56:25)
134
+ #loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":56:36)
135
+ #loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":57:25)
136
+ #loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":57:37)
137
+ #loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":57:4)
138
+ #loc50 = loc("xoffset"(#loc2))
139
+ #loc51 = loc("xoffset"(#loc3))
140
+ #loc52 = loc("xindex"(#loc4))
141
+ #loc53 = loc("xindex"(#loc5))
142
+ #loc54 = loc("xmask"(#loc6))
143
+ #loc55 = loc("r0_index"(#loc7))
144
+ #loc56 = loc("tmp0"(#loc8))
145
+ #loc57 = loc("tmp0"(#loc9))
146
+ #loc58 = loc("tmp0"(#loc10))
147
+ #loc59 = loc("tmp0"(#loc11))
148
+ #loc60 = loc("tmp5"(#loc12))
149
+ #loc61 = loc("tmp5"(#loc13))
150
+ #loc62 = loc("tmp3"(#loc14))
151
+ #loc64 = loc("mask"(#loc17))
152
+ #loc65 = loc("mask"(#loc18))
153
+ #loc66 = loc("mask"(#loc19))
154
+ #loc67 = loc("tmp4"(#loc21))
155
+ #loc68 = loc("tmp7"(#loc22))
156
+ #loc69 = loc("tmp8"(#loc23))
157
+ #loc70 = loc("tmp10"(#loc24))
158
+ #loc71 = loc("tmp11"(#loc25))
159
+ #loc72 = loc("tmp12"(#loc26))
160
+ #loc73 = loc("tmp15"(#loc27))
161
+ #loc75 = loc("tmp16"(#loc31))
162
+ #loc76 = loc("tmp18"(#loc32))
163
+ #loc77 = loc("tmp19"(#loc33))
164
+ #loc78 = loc("tmp20"(#loc34))
165
+ #loc79 = loc("tmp22"(#loc35))
166
+ #loc80 = loc(callsite(#loc15 at #loc63))
167
+ #loc82 = loc("mask"(#loc64))
168
+ #loc83 = loc("mask"(#loc66))
169
+ #loc84 = loc(callsite(#loc28 at #loc74))
170
+ #loc86 = loc(callsite(#loc82 at #loc80))
171
+ #loc87 = loc(callsite(#loc65 at #loc80))
172
+ #loc88 = loc(callsite(#loc83 at #loc80))
173
+ #loc89 = loc(callsite(#loc20 at #loc80))
174
+ #loc90 = loc(callsite(#loc30 at #loc84))
progress/SpecForge/cache/compiled_kernels/triton/3/2GPX4H2RCLQEV2LIJWPK5V4BGHUBOBJI3FUMPIZKQJ7CMB34OTAA/triton_per_fused_mul_1.ttir ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":18:0)
2
+ #loc5 = loc(unknown)
3
+ #loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":38:37)
4
+ #loc33 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":48:26)
5
+ #loc45 = loc("in_ptr0"(#loc))
6
+ #loc46 = loc("in_ptr1"(#loc))
7
+ #loc47 = loc("out_ptr0"(#loc))
8
+ #loc48 = loc("out_ptr1"(#loc))
9
+ #loc49 = loc("out_ptr2"(#loc))
10
+ #loc50 = loc("xnumel"(#loc))
11
+ #loc51 = loc("r0_numel"(#loc))
12
+ #loc70 = loc("tmp4"(#loc21))
13
+ #loc80 = loc("tmp16"(#loc33))
14
+ #loc85 = loc(callsite(#loc5 at #loc70))
15
+ #loc89 = loc(callsite(#loc5 at #loc80))
16
+ module {
17
+ tt.func public @triton_per_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
18
+ %tmp22 = arith.constant dense<0.693147182> : tensor<128x1xf32> loc(#loc52)
19
+ %tmp18 = arith.constant dense<1.000000e+00> : tensor<128x1xf32> loc(#loc53)
20
+ %tmp7 = arith.constant dense<0xFF800000> : tensor<128x1xf32> loc(#loc54)
21
+ %tmp3 = arith.constant dense<0xFF800000> : tensor<128x32xf32> loc(#loc55)
22
+ %cst = arith.constant dense<0.000000e+00> : tensor<128x32xf32> loc(#loc5)
23
+ %cst_0 = arith.constant dense<1536> : tensor<1x32xi32> loc(#loc5)
24
+ %xmask = arith.constant dense<1536> : tensor<128x1xi32> loc(#loc56)
25
+ %c128_i32 = arith.constant 128 : i32 loc(#loc5)
26
+ %xoffset = tt.get_program_id x : i32 loc(#loc57)
27
+ %xoffset_1 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc58)
28
+ %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc59)
29
+ %xindex_2 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc60)
30
+ %xindex_3 = tt.splat %xoffset_1 : i32 -> tensor<128x1xi32> loc(#loc61)
31
+ %xindex_4 = arith.addi %xindex_3, %xindex_2 : tensor<128x1xi32> loc(#loc61)
32
+ %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<128x1xi32> loc(#loc56)
33
+ %r0_index = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc62)
34
+ %r0_index_6 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc63)
35
+ %tmp0 = arith.muli %r0_index_6, %cst_0 : tensor<1x32xi32> loc(#loc64)
36
+ %tmp0_7 = tt.broadcast %xindex_4 : tensor<128x1xi32> -> tensor<128x32xi32> loc(#loc65)
37
+ %tmp0_8 = tt.broadcast %tmp0 : tensor<1x32xi32> -> tensor<128x32xi32> loc(#loc65)
38
+ %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<128x32xi32> loc(#loc65)
39
+ %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<128x32x!tt.ptr<f32>> loc(#loc66)
40
+ %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<128x32x!tt.ptr<f32>>, tensor<128x32xi32> loc(#loc66)
41
+ %tmp0_12 = tt.broadcast %xmask_5 : tensor<128x1xi1> -> tensor<128x32xi1> loc(#loc67)
42
+ %tmp0_13 = tt.load %tmp0_11, %tmp0_12, %cst : tensor<128x32x!tt.ptr<f32>> loc(#loc67)
43
+ %tmp5 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<128x32x!tt.ptr<f32>> loc(#loc68)
44
+ %tmp5_14 = tt.addptr %tmp5, %tmp0_9 : tensor<128x32x!tt.ptr<f32>>, tensor<128x32xi32> loc(#loc68)
45
+ %tmp5_15 = tt.load %tmp5_14, %tmp0_12, %cst : tensor<128x32x!tt.ptr<f32>> loc(#loc69)
46
+ %tmp3_16 = arith.select %tmp0_12, %tmp0_13, %tmp3 : tensor<128x32xi1>, tensor<128x32xf32> loc(#loc55)
47
+ %tmp4 = "tt.reduce"(%tmp3_16) <{axis = 1 : i32}> ({
48
+ ^bb0(%tmp4_24: f32 loc(callsite(#loc5 at #loc70)), %tmp4_25: f32 loc(callsite(#loc5 at #loc70))):
49
+ %mask = arith.cmpf ogt, %tmp4_24, %tmp4_25 : f32 loc(#loc90)
50
+ %mask_26 = arith.cmpf une, %tmp4_24, %tmp4_24 : f32 loc(#loc91)
51
+ %mask_27 = arith.ori %mask, %mask_26 : i1 loc(#loc92)
52
+ %tmp4_28 = arith.select %mask_27, %tmp4_24, %tmp4_25 : f32 loc(#loc93)
53
+ tt.reduce.return %tmp4_28 : f32 loc(#loc84)
54
+ }) : (tensor<128x32xf32>) -> tensor<128xf32> loc(#loc84)
55
+ %tmp4_17 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc74)
56
+ %tmp7_18 = arith.cmpf oeq, %tmp4_17, %tmp7 : tensor<128x1xf32> loc(#loc54)
57
+ %tmp8 = tt.broadcast %tmp4_17 : tensor<128x1xf32> -> tensor<128x32xf32> loc(#loc75)
58
+ %tmp8_19 = arith.subf %tmp0_13, %tmp8 : tensor<128x32xf32> loc(#loc75)
59
+ %tmp10 = tt.broadcast %tmp7_18 : tensor<128x1xi1> -> tensor<128x32xi1> loc(#loc76)
60
+ %tmp10_20 = arith.select %tmp10, %cst, %tmp8_19 : tensor<128x32xi1>, tensor<128x32xf32> loc(#loc76)
61
+ %tmp11 = tt.extern_elementwise %tmp10_20 {libname = "", libpath = "", pure = true, symbol = "__nv_exp2f"} : (tensor<128x32xf32>) -> tensor<128x32xf32> loc(#loc77)
62
+ %tmp12 = arith.mulf %tmp5_15, %tmp11 : tensor<128x32xf32> loc(#loc78)
63
+ %tmp15 = arith.select %tmp0_12, %tmp12, %cst : tensor<128x32xi1>, tensor<128x32xf32> loc(#loc79)
64
+ %tmp16 = "tt.reduce"(%tmp15) <{axis = 1 : i32}> ({
65
+ ^bb0(%tmp16_24: f32 loc(callsite(#loc5 at #loc80)), %tmp16_25: f32 loc(callsite(#loc5 at #loc80))):
66
+ %tmp16_26 = arith.addf %tmp16_24, %tmp16_25 : f32 loc(#loc94)
67
+ tt.reduce.return %tmp16_26 : f32 loc(#loc88)
68
+ }) : (tensor<128x32xf32>) -> tensor<128xf32> loc(#loc88)
69
+ %tmp16_21 = tt.expand_dims %tmp16 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc81)
70
+ %tmp18_22 = arith.select %tmp7_18, %tmp18, %tmp16_21 : tensor<128x1xi1>, tensor<128x1xf32> loc(#loc53)
71
+ %tmp19 = tt.extern_elementwise %tmp18_22 {libname = "", libpath = "", pure = true, symbol = "__nv_log2f"} : (tensor<128x1xf32>) -> tensor<128x1xf32> loc(#loc82)
72
+ %tmp20 = arith.addf %tmp19, %tmp4_17 : tensor<128x1xf32> loc(#loc83)
73
+ %tmp22_23 = arith.mulf %tmp20, %tmp22 : tensor<128x1xf32> loc(#loc52)
74
+ %0 = tt.splat %out_ptr2 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>> loc(#loc38)
75
+ %1 = tt.addptr %0, %xindex_4 : tensor<128x1x!tt.ptr<f32>>, tensor<128x1xi32> loc(#loc38)
76
+ tt.store %1, %tmp22_23, %xmask_5 : tensor<128x1x!tt.ptr<f32>> loc(#loc39)
77
+ %2 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>> loc(#loc40)
78
+ %3 = tt.addptr %2, %xindex_4 : tensor<128x1x!tt.ptr<f32>>, tensor<128x1xi32> loc(#loc40)
79
+ tt.store %3, %tmp4_17, %xmask_5 : tensor<128x1x!tt.ptr<f32>> loc(#loc41)
80
+ %4 = tt.splat %out_ptr1 : !tt.ptr<f32> -> tensor<128x1x!tt.ptr<f32>> loc(#loc42)
81
+ %5 = tt.addptr %4, %xindex_4 : tensor<128x1x!tt.ptr<f32>>, tensor<128x1xi32> loc(#loc42)
82
+ tt.store %5, %tmp16_21, %xmask_5 : tensor<128x1x!tt.ptr<f32>> loc(#loc43)
83
+ tt.return loc(#loc44)
84
+ } loc(#loc)
85
+ } loc(#loc)
86
+ #loc1 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":54:20)
87
+ #loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":50:34)
88
+ #loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":40:19)
89
+ #loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":37:33)
90
+ #loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":26:21)
91
+ #loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":24:28)
92
+ #loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":24:33)
93
+ #loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":25:36)
94
+ #loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":25:44)
95
+ #loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":25:23)
96
+ #loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":27:28)
97
+ #loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":27:38)
98
+ #loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":34:40)
99
+ #loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":34:35)
100
+ #loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":34:30)
101
+ #loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":34:47)
102
+ #loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":35:30)
103
+ #loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":35:47)
104
+ #loc20 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29)
105
+ #loc22 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15)
106
+ #loc23 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21)
107
+ #loc24 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16)
108
+ #loc25 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29)
109
+ #loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":38:40)
110
+ #loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":41:18)
111
+ #loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":43:33)
112
+ #loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":44:27)
113
+ #loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":45:19)
114
+ #loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":47:35)
115
+ #loc32 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
116
+ #loc34 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
117
+ #loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":48:29)
118
+ #loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":51:27)
119
+ #loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":52:20)
120
+ #loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":55:25)
121
+ #loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":55:37)
122
+ #loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":56:25)
123
+ #loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":56:36)
124
+ #loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":57:25)
125
+ #loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":57:37)
126
+ #loc44 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/r5/cr5gimrtqf5gqmgkgirz6nebp4ijwtbzhmqlsdgjqda33dr43qre.py":57:4)
127
+ #loc52 = loc("tmp22"(#loc1))
128
+ #loc53 = loc("tmp18"(#loc2))
129
+ #loc54 = loc("tmp7"(#loc3))
130
+ #loc55 = loc("tmp3"(#loc4))
131
+ #loc56 = loc("xmask"(#loc6))
132
+ #loc57 = loc("xoffset"(#loc7))
133
+ #loc58 = loc("xoffset"(#loc8))
134
+ #loc59 = loc("xindex"(#loc9))
135
+ #loc60 = loc("xindex"(#loc10))
136
+ #loc61 = loc("xindex"(#loc11))
137
+ #loc62 = loc("r0_index"(#loc12))
138
+ #loc63 = loc("r0_index"(#loc13))
139
+ #loc64 = loc("tmp0"(#loc14))
140
+ #loc65 = loc("tmp0"(#loc15))
141
+ #loc66 = loc("tmp0"(#loc16))
142
+ #loc67 = loc("tmp0"(#loc17))
143
+ #loc68 = loc("tmp5"(#loc18))
144
+ #loc69 = loc("tmp5"(#loc19))
145
+ #loc71 = loc("mask"(#loc22))
146
+ #loc72 = loc("mask"(#loc23))
147
+ #loc73 = loc("mask"(#loc24))
148
+ #loc74 = loc("tmp4"(#loc26))
149
+ #loc75 = loc("tmp8"(#loc27))
150
+ #loc76 = loc("tmp10"(#loc28))
151
+ #loc77 = loc("tmp11"(#loc29))
152
+ #loc78 = loc("tmp12"(#loc30))
153
+ #loc79 = loc("tmp15"(#loc31))
154
+ #loc81 = loc("tmp16"(#loc35))
155
+ #loc82 = loc("tmp19"(#loc36))
156
+ #loc83 = loc("tmp20"(#loc37))
157
+ #loc84 = loc(callsite(#loc20 at #loc70))
158
+ #loc86 = loc("mask"(#loc71))
159
+ #loc87 = loc("mask"(#loc73))
160
+ #loc88 = loc(callsite(#loc32 at #loc80))
161
+ #loc90 = loc(callsite(#loc86 at #loc84))
162
+ #loc91 = loc(callsite(#loc72 at #loc84))
163
+ #loc92 = loc(callsite(#loc87 at #loc84))
164
+ #loc93 = loc(callsite(#loc25 at #loc84))
165
+ #loc94 = loc(callsite(#loc34 at #loc88))
progress/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/__grp__triton_poi_fused_mul_1.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_poi_fused_mul_1.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.source", "triton_poi_fused_mul_1.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ttir", "triton_poi_fused_mul_1.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ttgir", "triton_poi_fused_mul_1.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.llir", "triton_poi_fused_mul_1.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ptx", "triton_poi_fused_mul_1.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.cubin", "triton_poi_fused_mul_1.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.json"}}
progress/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.cubin ADDED
Binary file (10.3 kB). View file
 
progress/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "d650530c018e98a61be4958bef98391bf5f4932885981bbdd4c94cc375f6e8e4", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_mul_1"}
progress/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.llir ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+ target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
4
+
5
+ ; Function Attrs: nounwind
6
+ define ptx_kernel void @triton_poi_fused_mul_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
7
+ %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
8
+ %8 = shl i32 %7, 7, !dbg !8
9
+ %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
10
+ %10 = and i32 %9, 127, !dbg !9
11
+ %11 = or disjoint i32 %8, %10, !dbg !10
12
+ %12 = icmp slt i32 %11, %3, !dbg !11
13
+ %13 = sext i32 %11 to i64, !dbg !12
14
+ %.frozen = freeze i64 %2, !dbg !13
15
+ %14 = sdiv i64 %13, %.frozen, !dbg !13
16
+ %15 = mul i64 %14, %.frozen, !dbg !12
17
+ %.decomposed = sub i64 %13, %15, !dbg !12
18
+ %.not = icmp ne i64 %.decomposed, 0, !dbg !17
19
+ %16 = icmp slt i32 %8, 0, !dbg !18
20
+ %17 = icmp slt i64 %2, 0, !dbg !19
21
+ %18 = xor i1 %16, %17, !dbg !20
22
+ %narrow = select i1 %18, i1 %.not, i1 false, !dbg !21
23
+ %19 = sext i1 %narrow to i64, !dbg !21
24
+ %20 = add nsw i64 %14, %19, !dbg !21
25
+ %21 = getelementptr float, ptr addrspace(1) %0, i64 %13, !dbg !22
26
+ %22 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !23
27
+ %23 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %21, i64 %22, i1 %12) #2, !dbg !23
28
+ %24 = bitcast i32 %23 to float, !dbg !23
29
+ %25 = fmul float %24, 0x3FE62E4300000000, !dbg !24
30
+ %26 = icmp slt i64 %2, 2, !dbg !25
31
+ %27 = icmp sgt i64 %2, 1, !dbg !26
32
+ %28 = select i1 %27, i64 %2, i64 0, !dbg !27
33
+ %29 = zext i1 %26 to i64, !dbg !28
34
+ %30 = add i64 %28, %29, !dbg !29
35
+ %31 = mul i64 %20, %30, !dbg !30
36
+ %32 = getelementptr float, ptr addrspace(1) %1, i64 %.decomposed, !dbg !31
37
+ %33 = getelementptr float, ptr addrspace(1) %32, i64 %31, !dbg !31
38
+ %34 = bitcast float %25 to i32, !dbg !32
39
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %34, ptr addrspace(1) %33, i1 %12) #2, !dbg !32
40
+ ret void, !dbg !33
41
+ }
42
+
43
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
44
+ declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
45
+
46
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
47
+ declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
48
+
49
+ attributes #0 = { nounwind "nvvm.reqntid"="128" }
50
+ attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
51
+ attributes #2 = { nounwind }
52
+
53
+ !llvm.dbg.cu = !{!0}
54
+ !llvm.module.flags = !{!2, !3}
55
+
56
+ !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
57
+ !1 = !DIFile(filename: "csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py", directory: "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz")
58
+ !2 = !{i32 2, !"Debug Info Version", i32 3}
59
+ !3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
60
+ !4 = distinct !DISubprogram(name: "triton_poi_fused_mul_1", linkageName: "triton_poi_fused_mul_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
61
+ !5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
62
+ !6 = !{}
63
+ !7 = !DILocation(line: 19, column: 28, scope: !4)
64
+ !8 = !DILocation(line: 19, column: 33, scope: !4)
65
+ !9 = !DILocation(line: 20, column: 36, scope: !4)
66
+ !10 = !DILocation(line: 20, column: 23, scope: !4)
67
+ !11 = !DILocation(line: 21, column: 21, scope: !4)
68
+ !12 = !DILocation(line: 23, column: 19, scope: !4)
69
+ !13 = !DILocation(line: 72, column: 16, scope: !14, inlinedAt: !16)
70
+ !14 = distinct !DILexicalBlockFile(scope: !4, file: !15, discriminator: 0)
71
+ !15 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime")
72
+ !16 = !DILocation(line: 24, column: 51, scope: !4)
73
+ !17 = !DILocation(line: 74, column: 34, scope: !14, inlinedAt: !16)
74
+ !18 = !DILocation(line: 75, column: 25, scope: !14, inlinedAt: !16)
75
+ !19 = !DILocation(line: 75, column: 36, scope: !14, inlinedAt: !16)
76
+ !20 = !DILocation(line: 75, column: 32, scope: !14, inlinedAt: !16)
77
+ !21 = !DILocation(line: 75, column: 47, scope: !14, inlinedAt: !16)
78
+ !22 = !DILocation(line: 25, column: 30, scope: !4)
79
+ !23 = !DILocation(line: 25, column: 35, scope: !4)
80
+ !24 = !DILocation(line: 27, column: 18, scope: !4)
81
+ !25 = !DILocation(line: 28, column: 49, scope: !4)
82
+ !26 = !DILocation(line: 28, column: 75, scope: !4)
83
+ !27 = !DILocation(line: 28, column: 66, scope: !4)
84
+ !28 = !DILocation(line: 28, scope: !4)
85
+ !29 = !DILocation(line: 28, column: 57, scope: !4)
86
+ !30 = !DILocation(line: 28, column: 34, scope: !4)
87
+ !31 = !DILocation(line: 28, column: 25, scope: !4)
88
+ !32 = !DILocation(line: 28, column: 88, scope: !4)
89
+ !33 = !DILocation(line: 28, column: 4, scope: !4)
progress/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ptx ADDED
@@ -0,0 +1,357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.7
6
+ .target sm_90a
7
+ .address_size 64
8
+
9
+ // .globl triton_poi_fused_mul_1 // -- Begin function triton_poi_fused_mul_1
10
+ // @triton_poi_fused_mul_1
11
+ .visible .entry triton_poi_fused_mul_1(
12
+ .param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_0,
13
+ .param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_1,
14
+ .param .u64 triton_poi_fused_mul_1_param_2,
15
+ .param .u32 triton_poi_fused_mul_1_param_3,
16
+ .param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_4,
17
+ .param .u64 .ptr .global .align 1 triton_poi_fused_mul_1_param_5
18
+ )
19
+ .reqntid 128
20
+ {
21
+ .reg .pred %p<11>;
22
+ .reg .b32 %r<13>;
23
+ .reg .b64 %rd<30>;
24
+ .loc 1 18 0 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:18:0
25
+ $L__func_begin0:
26
+ .loc 1 18 0 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:18:0
27
+
28
+ // %bb.0:
29
+ ld.param.b32 %r2, [triton_poi_fused_mul_1_param_3];
30
+ ld.param.b64 %rd7, [triton_poi_fused_mul_1_param_1];
31
+ ld.param.b64 %rd6, [triton_poi_fused_mul_1_param_0];
32
+ ld.param.b64 %rd8, [triton_poi_fused_mul_1_param_2];
33
+ $L__tmp0:
34
+ .loc 1 19 28 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:19:28
35
+ mov.u32 %r3, %ctaid.x;
36
+ .loc 1 19 33 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:19:33
37
+ shl.b32 %r1, %r3, 7;
38
+ .loc 1 20 36 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:20:36
39
+ mov.u32 %r4, %tid.x;
40
+ and.b32 %r5, %r4, 127;
41
+ .loc 1 20 23 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:20:23
42
+ or.b32 %r6, %r1, %r5;
43
+ .loc 1 23 19 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:23:19
44
+ cvt.s64.s32 %rd1, %r6;
45
+ $L__tmp1:
46
+ .loc 2 72 16 // triton_helpers.py:72:16 @[ csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:24:51 ]
47
+ or.b64 %rd10, %rd1, %rd8;
48
+ and.b64 %rd11, %rd10, -4294967296;
49
+ setp.ne.b64 %p1, %rd11, 0;
50
+ @%p1 bra $L__BB0_2;
51
+ bra.uni $L__BB0_1;
52
+ $L__BB0_2:
53
+ div.s64 %rd29, %rd1, %rd8;
54
+ bra.uni $L__BB0_3;
55
+ $L__BB0_1:
56
+ cvt.u32.u64 %r7, %rd8;
57
+ cvt.u32.u64 %r8, %rd1;
58
+ div.u32 %r9, %r8, %r7;
59
+ cvt.u64.u32 %rd29, %r9;
60
+ $L__tmp2:
61
+ $L__BB0_3:
62
+ .loc 2 0 16 // triton_helpers.py:0:16
63
+ cvt.u32.u64 %r12, %rd1;
64
+ .loc 1 21 21 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:21:21
65
+ setp.lt.s32 %p2, %r12, %r2;
66
+ .loc 1 23 19 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:23:19
67
+ mul.lo.s64 %rd17, %rd29, %rd8;
68
+ sub.s64 %rd18, %rd1, %rd17;
69
+ $L__tmp3:
70
+ .loc 2 74 34 // triton_helpers.py:74:34 @[ csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:24:51 ]
71
+ setp.ne.b64 %p4, %rd18, 0;
72
+ .loc 2 75 25 // triton_helpers.py:75:25 @[ csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:24:51 ]
73
+ setp.lt.s32 %p5, %r1, 0;
74
+ .loc 2 75 36 // triton_helpers.py:75:36 @[ csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:24:51 ]
75
+ setp.lt.s64 %p6, %rd8, 0;
76
+ .loc 2 75 32 // triton_helpers.py:75:32 @[ csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:24:51 ]
77
+ xor.pred %p7, %p5, %p6;
78
+ .loc 2 75 47 // triton_helpers.py:75:47 @[ csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:24:51 ]
79
+ and.pred %p8, %p7, %p4;
80
+ selp.b64 %rd19, -1, 0, %p8;
81
+ add.s64 %rd20, %rd29, %rd19;
82
+ $L__tmp4:
83
+ .loc 1 25 30 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:25:30
84
+ shl.b64 %rd21, %rd1, 2;
85
+ add.s64 %rd13, %rd6, %rd21;
86
+ .loc 1 25 35 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:25:35
87
+ // begin inline asm
88
+ mov.u64 %rd14, 0x0;
89
+ createpolicy.fractional.L2::evict_last.b64 %rd14, 1.0;
90
+ // end inline asm
91
+ // begin inline asm
92
+ mov.u32 %r10, 0x0;
93
+ @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r10 }, [ %rd13 + 0 ], %rd14;
94
+ // end inline asm
95
+ .loc 1 27 18 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:27:18
96
+ mul.f32 %r11, %r10, 0f3F317218;
97
+ .loc 1 28 49 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:28:49
98
+ setp.lt.s64 %p9, %rd8, 2;
99
+ .loc 1 28 75 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:28:75
100
+ setp.gt.s64 %p10, %rd8, 1;
101
+ .loc 1 28 66 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:28:66
102
+ selp.b64 %rd22, %rd8, 0, %p10;
103
+ .loc 1 28 0 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:28
104
+ selp.b64 %rd23, 1, 0, %p9;
105
+ .loc 1 28 57 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:28:57
106
+ add.s64 %rd24, %rd22, %rd23;
107
+ .loc 1 28 34 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:28:34
108
+ mul.lo.s64 %rd25, %rd20, %rd24;
109
+ .loc 1 28 25 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:28:25
110
+ shl.b64 %rd26, %rd18, 2;
111
+ add.s64 %rd27, %rd7, %rd26;
112
+ shl.b64 %rd28, %rd25, 2;
113
+ add.s64 %rd15, %rd27, %rd28;
114
+ .loc 1 28 88 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:28:88
115
+ // begin inline asm
116
+ @%p2 st.global.b32 [ %rd15 + 0 ], { %r11 };
117
+ // end inline asm
118
+ .loc 1 28 4 // csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py:28:4
119
+ ret;
120
+ $L__tmp5:
121
+ $L__func_end0:
122
+ // -- End function
123
+ }
124
+ .file 1 "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py"
125
+ .file 2 "/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py"
126
+ .section .debug_abbrev
127
+ {
128
+ .b8 1 // Abbreviation Code
129
+ .b8 17 // DW_TAG_compile_unit
130
+ .b8 1 // DW_CHILDREN_yes
131
+ .b8 37 // DW_AT_producer
132
+ .b8 8 // DW_FORM_string
133
+ .b8 19 // DW_AT_language
134
+ .b8 5 // DW_FORM_data2
135
+ .b8 3 // DW_AT_name
136
+ .b8 8 // DW_FORM_string
137
+ .b8 16 // DW_AT_stmt_list
138
+ .b8 6 // DW_FORM_data4
139
+ .b8 27 // DW_AT_comp_dir
140
+ .b8 8 // DW_FORM_string
141
+ .b8 0 // EOM(1)
142
+ .b8 0 // EOM(2)
143
+ .b8 2 // Abbreviation Code
144
+ .b8 46 // DW_TAG_subprogram
145
+ .b8 0 // DW_CHILDREN_no
146
+ .b8 3 // DW_AT_name
147
+ .b8 8 // DW_FORM_string
148
+ .b8 32 // DW_AT_inline
149
+ .b8 11 // DW_FORM_data1
150
+ .b8 0 // EOM(1)
151
+ .b8 0 // EOM(2)
152
+ .b8 3 // Abbreviation Code
153
+ .b8 46 // DW_TAG_subprogram
154
+ .b8 1 // DW_CHILDREN_yes
155
+ .b8 17 // DW_AT_low_pc
156
+ .b8 1 // DW_FORM_addr
157
+ .b8 18 // DW_AT_high_pc
158
+ .b8 1 // DW_FORM_addr
159
+ .b8 49 // DW_AT_abstract_origin
160
+ .b8 19 // DW_FORM_ref4
161
+ .b8 0 // EOM(1)
162
+ .b8 0 // EOM(2)
163
+ .b8 4 // Abbreviation Code
164
+ .b8 29 // DW_TAG_inlined_subroutine
165
+ .b8 0 // DW_CHILDREN_no
166
+ .b8 49 // DW_AT_abstract_origin
167
+ .b8 19 // DW_FORM_ref4
168
+ .b8 17 // DW_AT_low_pc
169
+ .b8 1 // DW_FORM_addr
170
+ .b8 18 // DW_AT_high_pc
171
+ .b8 1 // DW_FORM_addr
172
+ .b8 88 // DW_AT_call_file
173
+ .b8 11 // DW_FORM_data1
174
+ .b8 89 // DW_AT_call_line
175
+ .b8 11 // DW_FORM_data1
176
+ .b8 87 // DW_AT_call_column
177
+ .b8 11 // DW_FORM_data1
178
+ .b8 0 // EOM(1)
179
+ .b8 0 // EOM(2)
180
+ .b8 0 // EOM(3)
181
+ }
182
+ .section .debug_info
183
+ {
184
+ .b32 211 // Length of Unit
185
+ .b8 2 // DWARF version number
186
+ .b8 0
187
+ .b32 .debug_abbrev // Offset Into Abbrev. Section
188
+ .b8 8 // Address Size (in bytes)
189
+ .b8 1 // Abbrev [1] 0xb:0xcc DW_TAG_compile_unit
190
+ .b8 116 // DW_AT_producer
191
+ .b8 114
192
+ .b8 105
193
+ .b8 116
194
+ .b8 111
195
+ .b8 110
196
+ .b8 0
197
+ .b8 2 // DW_AT_language
198
+ .b8 0
199
+ .b8 99 // DW_AT_name
200
+ .b8 115
201
+ .b8 122
202
+ .b8 52
203
+ .b8 111
204
+ .b8 121
205
+ .b8 113
206
+ .b8 121
207
+ .b8 97
208
+ .b8 52
209
+ .b8 116
210
+ .b8 114
211
+ .b8 99
212
+ .b8 53
213
+ .b8 108
214
+ .b8 108
215
+ .b8 53
216
+ .b8 53
217
+ .b8 122
218
+ .b8 106
219
+ .b8 103
220
+ .b8 101
221
+ .b8 50
222
+ .b8 106
223
+ .b8 112
224
+ .b8 98
225
+ .b8 102
226
+ .b8 110
227
+ .b8 112
228
+ .b8 110
229
+ .b8 51
230
+ .b8 52
231
+ .b8 99
232
+ .b8 105
233
+ .b8 52
234
+ .b8 54
235
+ .b8 50
236
+ .b8 107
237
+ .b8 110
238
+ .b8 103
239
+ .b8 108
240
+ .b8 53
241
+ .b8 53
242
+ .b8 98
243
+ .b8 105
244
+ .b8 54
245
+ .b8 54
246
+ .b8 106
247
+ .b8 111
248
+ .b8 107
249
+ .b8 99
250
+ .b8 120
251
+ .b8 46
252
+ .b8 112
253
+ .b8 121
254
+ .b8 0
255
+ .b32 .debug_line // DW_AT_stmt_list
256
+ .b8 47 // DW_AT_comp_dir
257
+ .b8 119
258
+ .b8 111
259
+ .b8 114
260
+ .b8 107
261
+ .b8 115
262
+ .b8 112
263
+ .b8 97
264
+ .b8 99
265
+ .b8 101
266
+ .b8 47
267
+ .b8 104
268
+ .b8 97
269
+ .b8 110
270
+ .b8 114
271
+ .b8 117
272
+ .b8 105
273
+ .b8 47
274
+ .b8 106
275
+ .b8 117
276
+ .b8 110
277
+ .b8 113
278
+ .b8 117
279
+ .b8 97
280
+ .b8 110
281
+ .b8 47
282
+ .b8 83
283
+ .b8 112
284
+ .b8 101
285
+ .b8 99
286
+ .b8 70
287
+ .b8 111
288
+ .b8 114
289
+ .b8 103
290
+ .b8 101
291
+ .b8 47
292
+ .b8 99
293
+ .b8 97
294
+ .b8 99
295
+ .b8 104
296
+ .b8 101
297
+ .b8 47
298
+ .b8 99
299
+ .b8 111
300
+ .b8 109
301
+ .b8 112
302
+ .b8 105
303
+ .b8 108
304
+ .b8 101
305
+ .b8 100
306
+ .b8 95
307
+ .b8 107
308
+ .b8 101
309
+ .b8 114
310
+ .b8 110
311
+ .b8 101
312
+ .b8 108
313
+ .b8 115
314
+ .b8 47
315
+ .b8 115
316
+ .b8 122
317
+ .b8 0
318
+ .b8 2 // Abbrev [2] 0x8f:0x19 DW_TAG_subprogram
319
+ .b8 116 // DW_AT_name
320
+ .b8 114
321
+ .b8 105
322
+ .b8 116
323
+ .b8 111
324
+ .b8 110
325
+ .b8 95
326
+ .b8 112
327
+ .b8 111
328
+ .b8 105
329
+ .b8 95
330
+ .b8 102
331
+ .b8 117
332
+ .b8 115
333
+ .b8 101
334
+ .b8 100
335
+ .b8 95
336
+ .b8 109
337
+ .b8 117
338
+ .b8 108
339
+ .b8 95
340
+ .b8 49
341
+ .b8 0
342
+ .b8 1 // DW_AT_inline
343
+ .b8 3 // Abbrev [3] 0xa8:0x2e DW_TAG_subprogram
344
+ .b64 $L__func_begin0 // DW_AT_low_pc
345
+ .b64 $L__func_end0 // DW_AT_high_pc
346
+ .b32 143 // DW_AT_abstract_origin
347
+ .b8 4 // Abbrev [4] 0xbd:0x18 DW_TAG_inlined_subroutine
348
+ .b32 143 // DW_AT_abstract_origin
349
+ .b64 $L__tmp1 // DW_AT_low_pc
350
+ .b64 $L__tmp4 // DW_AT_high_pc
351
+ .b8 1 // DW_AT_call_file
352
+ .b8 24 // DW_AT_call_line
353
+ .b8 51 // DW_AT_call_column
354
+ .b8 0 // End Of Children Mark
355
+ .b8 0 // End Of Children Mark
356
+ }
357
+ .section .debug_macinfo { }
progress/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.source ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":18:0)
2
+ #loc22 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":69:0)
3
+ #loc34 = loc("in_ptr0"(#loc))
4
+ #loc35 = loc("out_ptr0"(#loc))
5
+ #loc36 = loc("ks0"(#loc))
6
+ #loc37 = loc("xnumel"(#loc))
7
+ #loc49 = loc("a"(#loc22))
8
+ #loc50 = loc("b"(#loc22))
9
+ module {
10
+ tt.func public @triton_poi_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
11
+ %xoffset = tt.get_program_id x : i32 loc(#loc38)
12
+ %xoffset_0 = arith.constant 128 : i32 loc(#loc39)
13
+ %xoffset_1 = arith.constant 128 : i32 loc(#loc39)
14
+ %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc39)
15
+ %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc40)
16
+ %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<128xi32> loc(#loc41)
17
+ %xindex_4 = arith.addi %xindex_3, %xindex : tensor<128xi32> loc(#loc41)
18
+ %xmask = tt.splat %xnumel : i32 -> tensor<128xi32> loc(#loc42)
19
+ %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<128xi32> loc(#loc42)
20
+ %x0 = arith.extsi %xindex_4 : tensor<128xi32> to tensor<128xi64> loc(#loc43)
21
+ %x0_6 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc43)
22
+ %x0_7 = arith.remsi %x0, %x0_6 : tensor<128xi64> loc(#loc43)
23
+ %x1 = tt.call @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S128S_i64__(%xindex_4, %ks0) : (tensor<128xi32>, i64) -> tensor<128xi64> loc(#loc44)
24
+ %tmp0 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>> loc(#loc45)
25
+ %tmp0_8 = tt.addptr %tmp0, %xindex_4 : tensor<128x!tt.ptr<f32>>, tensor<128xi32> loc(#loc45)
26
+ %tmp0_9 = tt.load %tmp0_8, %xmask_5 evictionPolicy = evict_last : tensor<128x!tt.ptr<f32>> loc(#loc46)
27
+ %tmp1 = arith.constant 0.693147182 : f32 loc(#loc47)
28
+ %tmp2 = arith.constant dense<0.693147182> : tensor<128xf32> loc(#loc48)
29
+ %tmp2_10 = arith.mulf %tmp0_9, %tmp2 : tensor<128xf32> loc(#loc48)
30
+ %c1_i32 = arith.constant 1 : i32 loc(#loc12)
31
+ %0 = arith.extsi %c1_i32 : i32 to i64 loc(#loc12)
32
+ %1 = arith.cmpi sge, %0, %ks0 : i64 loc(#loc12)
33
+ %c1_i32_11 = arith.constant 1 : i32 loc(#loc13)
34
+ %c1_i32_12 = arith.constant 1 : i32 loc(#loc13)
35
+ %2 = arith.extui %1 : i1 to i32 loc(#loc13)
36
+ %3 = arith.muli %c1_i32_12, %2 : i32 loc(#loc13)
37
+ %c1_i32_13 = arith.constant 1 : i32 loc(#loc14)
38
+ %4 = arith.extsi %c1_i32_13 : i32 to i64 loc(#loc14)
39
+ %5 = arith.cmpi sgt, %ks0, %4 : i64 loc(#loc14)
40
+ %6 = arith.extui %5 : i1 to i64 loc(#loc15)
41
+ %7 = arith.muli %ks0, %6 : i64 loc(#loc15)
42
+ %8 = arith.extsi %3 : i32 to i64 loc(#loc16)
43
+ %9 = arith.addi %8, %7 : i64 loc(#loc16)
44
+ %10 = tt.splat %9 : i64 -> tensor<128xi64> loc(#loc17)
45
+ %11 = arith.muli %x1, %10 : tensor<128xi64> loc(#loc17)
46
+ %12 = arith.addi %x0_7, %11 : tensor<128xi64> loc(#loc18)
47
+ %13 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>> loc(#loc19)
48
+ %14 = tt.addptr %13, %12 : tensor<128x!tt.ptr<f32>>, tensor<128xi64> loc(#loc19)
49
+ tt.store %14, %tmp2_10, %xmask_5 : tensor<128x!tt.ptr<f32>> loc(#loc20)
50
+ tt.return loc(#loc21)
51
+ } loc(#loc)
52
+ tt.func private @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S128S_i64__(%a: tensor<128xi32> loc("a"(#loc22)), %b: i64 loc("b"(#loc22))) -> tensor<128xi64> attributes {noinline = false} {
53
+ %quot = arith.extsi %a : tensor<128xi32> to tensor<128xi64> loc(#loc51)
54
+ %quot_0 = tt.splat %b : i64 -> tensor<128xi64> loc(#loc51)
55
+ %quot_1 = arith.divsi %quot, %quot_0 : tensor<128xi64> loc(#loc51)
56
+ %remainder = arith.extsi %a : tensor<128xi32> to tensor<128xi64> loc(#loc52)
57
+ %remainder_2 = tt.splat %b : i64 -> tensor<128xi64> loc(#loc52)
58
+ %remainder_3 = arith.remsi %remainder, %remainder_2 : tensor<128xi64> loc(#loc52)
59
+ %fixed = arith.constant 0 : i32 loc(#loc53)
60
+ %fixed_4 = arith.extsi %fixed : i32 to i64 loc(#loc53)
61
+ %fixed_5 = tt.splat %fixed_4 : i64 -> tensor<128xi64> loc(#loc53)
62
+ %fixed_6 = arith.cmpi ne, %remainder_3, %fixed_5 : tensor<128xi64> loc(#loc53)
63
+ %fixed_7 = arith.constant 1 : i32 loc(#loc54)
64
+ %fixed_8 = arith.constant 1 : i64 loc(#loc54)
65
+ %fixed_9 = arith.constant dense<1> : tensor<128xi64> loc(#loc54)
66
+ %fixed_10 = arith.subi %quot_1, %fixed_9 : tensor<128xi64> loc(#loc54)
67
+ %fixed_11 = arith.select %fixed_6, %fixed_10, %quot_1 : tensor<128xi1>, tensor<128xi64> loc(#loc55)
68
+ %c0_i32 = arith.constant 0 : i32 loc(#loc28)
69
+ %cst = arith.constant dense<0> : tensor<128xi32> loc(#loc28)
70
+ %0 = arith.cmpi slt, %a, %cst : tensor<128xi32> loc(#loc28)
71
+ %c0_i32_12 = arith.constant 0 : i32 loc(#loc29)
72
+ %1 = arith.extsi %c0_i32_12 : i32 to i64 loc(#loc29)
73
+ %2 = arith.cmpi slt, %b, %1 : i64 loc(#loc29)
74
+ %3 = tt.splat %2 : i1 -> tensor<128xi1> loc(#loc30)
75
+ %4 = arith.cmpi ne, %0, %3 : tensor<128xi1> loc(#loc30)
76
+ %5 = arith.select %4, %fixed_11, %quot_1 : tensor<128xi1>, tensor<128xi64> loc(#loc31)
77
+ tt.return %5 : tensor<128xi64> loc(#loc32)
78
+ ^bb1: // no predecessors
79
+ %6 = ub.poison : tensor<128xi64> loc(#loc33)
80
+ tt.return %6 : tensor<128xi64> loc(#loc33)
81
+ } loc(#loc22)
82
+ } loc(#loc)
83
+ #loc1 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":19:28)
84
+ #loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":19:33)
85
+ #loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":20:36)
86
+ #loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":20:23)
87
+ #loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":21:21)
88
+ #loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":23:19)
89
+ #loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":24:51)
90
+ #loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":25:30)
91
+ #loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":25:35)
92
+ #loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":26:11)
93
+ #loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":27:18)
94
+ #loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:49)
95
+ #loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:41)
96
+ #loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:75)
97
+ #loc15 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:66)
98
+ #loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:57)
99
+ #loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:34)
100
+ #loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:30)
101
+ #loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:25)
102
+ #loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:88)
103
+ #loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:4)
104
+ #loc23 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
105
+ #loc24 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20)
106
+ #loc25 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
107
+ #loc26 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
108
+ #loc27 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
109
+ #loc28 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
110
+ #loc29 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36)
111
+ #loc30 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
112
+ #loc31 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
113
+ #loc32 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:11)
114
+ #loc33 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:4)
115
+ #loc38 = loc("xoffset"(#loc1))
116
+ #loc39 = loc("xoffset"(#loc2))
117
+ #loc40 = loc("xindex"(#loc3))
118
+ #loc41 = loc("xindex"(#loc4))
119
+ #loc42 = loc("xmask"(#loc5))
120
+ #loc43 = loc("x0"(#loc6))
121
+ #loc44 = loc("x1"(#loc7))
122
+ #loc45 = loc("tmp0"(#loc8))
123
+ #loc46 = loc("tmp0"(#loc9))
124
+ #loc47 = loc("tmp1"(#loc10))
125
+ #loc48 = loc("tmp2"(#loc11))
126
+ #loc51 = loc("quot"(#loc23))
127
+ #loc52 = loc("remainder"(#loc24))
128
+ #loc53 = loc("fixed"(#loc25))
129
+ #loc54 = loc("fixed"(#loc26))
130
+ #loc55 = loc("fixed"(#loc27))
progress/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ttgir ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
2
+ #loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":18:0)
3
+ #loc30 = loc("in_ptr0"(#loc))
4
+ #loc31 = loc("out_ptr0"(#loc))
5
+ #loc32 = loc("ks0"(#loc))
6
+ #loc33 = loc("xnumel"(#loc))
7
+ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
8
+ tt.func public @triton_poi_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
9
+ %c128_i32 = arith.constant 128 : i32 loc(#loc1)
10
+ %cst = arith.constant dense<0.693147182> : tensor<128xf32, #blocked> loc(#loc1)
11
+ %c1_i64 = arith.constant 1 : i64 loc(#loc1)
12
+ %c0_i64 = arith.constant 0 : i64 loc(#loc1)
13
+ %cst_0 = arith.constant dense<0> : tensor<128xi64, #blocked> loc(#loc1)
14
+ %cst_1 = arith.constant dense<0> : tensor<128xi32, #blocked> loc(#loc1)
15
+ %cst_2 = arith.constant dense<1> : tensor<128xi64, #blocked> loc(#loc1)
16
+ %xoffset = tt.get_program_id x : i32 loc(#loc34)
17
+ %xoffset_3 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc35)
18
+ %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked> loc(#loc36)
19
+ %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<128xi32, #blocked> loc(#loc37)
20
+ %xindex_5 = arith.addi %xindex_4, %xindex : tensor<128xi32, #blocked> loc(#loc37)
21
+ %xmask = tt.splat %xnumel : i32 -> tensor<128xi32, #blocked> loc(#loc38)
22
+ %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<128xi32, #blocked> loc(#loc38)
23
+ %x0 = arith.extsi %xindex_5 : tensor<128xi32, #blocked> to tensor<128xi64, #blocked> loc(#loc39)
24
+ %x0_7 = tt.splat %ks0 : i64 -> tensor<128xi64, #blocked> loc(#loc39)
25
+ %x0_8 = arith.remsi %x0, %x0_7 : tensor<128xi64, #blocked> loc(#loc39)
26
+ %quot = arith.divsi %x0, %x0_7 : tensor<128xi64, #blocked> loc(#loc49)
27
+ %fixed = arith.cmpi ne, %x0_8, %cst_0 : tensor<128xi64, #blocked> loc(#loc50)
28
+ %fixed_9 = arith.subi %quot, %cst_2 : tensor<128xi64, #blocked> loc(#loc51)
29
+ %fixed_10 = arith.select %fixed, %fixed_9, %quot : tensor<128xi1, #blocked>, tensor<128xi64, #blocked> loc(#loc52)
30
+ %x1 = arith.cmpi slt, %xindex_5, %cst_1 : tensor<128xi32, #blocked> loc(#loc53)
31
+ %x1_11 = arith.cmpi slt, %ks0, %c0_i64 : i64 loc(#loc54)
32
+ %x1_12 = tt.splat %x1_11 : i1 -> tensor<128xi1, #blocked> loc(#loc55)
33
+ %x1_13 = arith.cmpi ne, %x1, %x1_12 : tensor<128xi1, #blocked> loc(#loc55)
34
+ %x1_14 = arith.select %x1_13, %fixed_10, %quot : tensor<128xi1, #blocked>, tensor<128xi64, #blocked> loc(#loc56)
35
+ %tmp0 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>, #blocked> loc(#loc45)
36
+ %tmp0_15 = tt.addptr %tmp0, %xindex_5 : tensor<128x!tt.ptr<f32>, #blocked>, tensor<128xi32, #blocked> loc(#loc45)
37
+ %tmp0_16 = tt.load %tmp0_15, %xmask_6 evictionPolicy = evict_last : tensor<128x!tt.ptr<f32>, #blocked> loc(#loc46)
38
+ %tmp2 = arith.mulf %tmp0_16, %cst : tensor<128xf32, #blocked> loc(#loc47)
39
+ %0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc20)
40
+ %1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc21)
41
+ %2 = arith.extui %1 : i1 to i64 loc(#loc22)
42
+ %3 = arith.muli %ks0, %2 : i64 loc(#loc22)
43
+ %4 = arith.extui %0 : i1 to i64 loc(#loc48)
44
+ %5 = arith.addi %4, %3 : i64 loc(#loc23)
45
+ %6 = tt.splat %5 : i64 -> tensor<128xi64, #blocked> loc(#loc25)
46
+ %7 = arith.muli %x1_14, %6 : tensor<128xi64, #blocked> loc(#loc25)
47
+ %8 = arith.addi %x0_8, %7 : tensor<128xi64, #blocked> loc(#loc26)
48
+ %9 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>, #blocked> loc(#loc27)
49
+ %10 = tt.addptr %9, %8 : tensor<128x!tt.ptr<f32>, #blocked>, tensor<128xi64, #blocked> loc(#loc27)
50
+ tt.store %10, %tmp2, %xmask_6 : tensor<128x!tt.ptr<f32>, #blocked> loc(#loc28)
51
+ tt.return loc(#loc29)
52
+ } loc(#loc)
53
+ } loc(#loc)
54
+ #loc1 = loc(unknown)
55
+ #loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":19:28)
56
+ #loc3 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":19:33)
57
+ #loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":20:36)
58
+ #loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":20:23)
59
+ #loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":21:21)
60
+ #loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":23:19)
61
+ #loc8 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
62
+ #loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":24:51)
63
+ #loc10 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
64
+ #loc11 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
65
+ #loc12 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
66
+ #loc13 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
67
+ #loc14 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36)
68
+ #loc15 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
69
+ #loc16 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
70
+ #loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":25:30)
71
+ #loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":25:35)
72
+ #loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":27:18)
73
+ #loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:49)
74
+ #loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:75)
75
+ #loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:66)
76
+ #loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:57)
77
+ #loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:41)
78
+ #loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:34)
79
+ #loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:30)
80
+ #loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:25)
81
+ #loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:88)
82
+ #loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:4)
83
+ #loc34 = loc("xoffset"(#loc2))
84
+ #loc35 = loc("xoffset"(#loc3))
85
+ #loc36 = loc("xindex"(#loc4))
86
+ #loc37 = loc("xindex"(#loc5))
87
+ #loc38 = loc("xmask"(#loc6))
88
+ #loc39 = loc("x0"(#loc7))
89
+ #loc40 = loc("quot"(#loc8))
90
+ #loc41 = loc("x1"(#loc9))
91
+ #loc42 = loc("fixed"(#loc10))
92
+ #loc43 = loc("fixed"(#loc11))
93
+ #loc44 = loc("fixed"(#loc12))
94
+ #loc45 = loc("tmp0"(#loc17))
95
+ #loc46 = loc("tmp0"(#loc18))
96
+ #loc47 = loc("tmp2"(#loc19))
97
+ #loc48 = loc(fused[#loc23, #loc24])
98
+ #loc49 = loc(callsite(#loc40 at #loc41))
99
+ #loc50 = loc(callsite(#loc42 at #loc41))
100
+ #loc51 = loc(callsite(#loc43 at #loc41))
101
+ #loc52 = loc(callsite(#loc44 at #loc41))
102
+ #loc53 = loc(callsite(#loc13 at #loc41))
103
+ #loc54 = loc(callsite(#loc14 at #loc41))
104
+ #loc55 = loc(callsite(#loc15 at #loc41))
105
+ #loc56 = loc(callsite(#loc16 at #loc41))
progress/SpecForge/cache/compiled_kernels/triton/3/2ZIFGDABR2MKMG7ESWF67GBZDP27JEZIQWMBXPOUZFGMG5PW5DSA/triton_poi_fused_mul_1.ttir ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":18:0)
2
+ #loc30 = loc("in_ptr0"(#loc))
3
+ #loc31 = loc("out_ptr0"(#loc))
4
+ #loc32 = loc("ks0"(#loc))
5
+ #loc33 = loc("xnumel"(#loc))
6
+ module {
7
+ tt.func public @triton_poi_fused_mul_1(%in_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
8
+ %fixed = arith.constant dense<1> : tensor<128xi64> loc(#loc49)
9
+ %x1 = arith.constant dense<0> : tensor<128xi32> loc(#loc50)
10
+ %fixed_0 = arith.constant dense<0> : tensor<128xi64> loc(#loc51)
11
+ %x1_1 = arith.constant 0 : i64 loc(#loc52)
12
+ %c1_i64 = arith.constant 1 : i64 loc(#loc6)
13
+ %tmp2 = arith.constant dense<0.693147182> : tensor<128xf32> loc(#loc37)
14
+ %c128_i32 = arith.constant 128 : i32 loc(#loc6)
15
+ %xoffset = tt.get_program_id x : i32 loc(#loc38)
16
+ %xoffset_2 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc39)
17
+ %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc40)
18
+ %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<128xi32> loc(#loc41)
19
+ %xindex_4 = arith.addi %xindex_3, %xindex : tensor<128xi32> loc(#loc41)
20
+ %xmask = tt.splat %xnumel : i32 -> tensor<128xi32> loc(#loc42)
21
+ %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<128xi32> loc(#loc42)
22
+ %x0 = arith.extsi %xindex_4 : tensor<128xi32> to tensor<128xi64> loc(#loc43)
23
+ %x0_6 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc43)
24
+ %x0_7 = arith.remsi %x0, %x0_6 : tensor<128xi64> loc(#loc43)
25
+ %quot = arith.divsi %x0, %x0_6 : tensor<128xi64> loc(#loc53)
26
+ %fixed_8 = arith.cmpi ne, %x0_7, %fixed_0 : tensor<128xi64> loc(#loc51)
27
+ %fixed_9 = arith.subi %quot, %fixed : tensor<128xi64> loc(#loc49)
28
+ %fixed_10 = arith.select %fixed_8, %fixed_9, %quot : tensor<128xi1>, tensor<128xi64> loc(#loc54)
29
+ %x1_11 = arith.cmpi slt, %xindex_4, %x1 : tensor<128xi32> loc(#loc50)
30
+ %x1_12 = arith.cmpi slt, %ks0, %x1_1 : i64 loc(#loc52)
31
+ %x1_13 = tt.splat %x1_12 : i1 -> tensor<128xi1> loc(#loc55)
32
+ %x1_14 = arith.cmpi ne, %x1_11, %x1_13 : tensor<128xi1> loc(#loc55)
33
+ %x1_15 = arith.select %x1_14, %fixed_10, %quot : tensor<128xi1>, tensor<128xi64> loc(#loc56)
34
+ %tmp0 = tt.splat %in_ptr0 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>> loc(#loc46)
35
+ %tmp0_16 = tt.addptr %tmp0, %xindex_4 : tensor<128x!tt.ptr<f32>>, tensor<128xi32> loc(#loc46)
36
+ %tmp0_17 = tt.load %tmp0_16, %xmask_5 evictionPolicy = evict_last : tensor<128x!tt.ptr<f32>> loc(#loc47)
37
+ %tmp2_18 = arith.mulf %tmp0_17, %tmp2 : tensor<128xf32> loc(#loc37)
38
+ %0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc20)
39
+ %1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc21)
40
+ %2 = arith.extui %1 : i1 to i64 loc(#loc22)
41
+ %3 = arith.muli %ks0, %2 : i64 loc(#loc22)
42
+ %4 = arith.extui %0 : i1 to i64 loc(#loc48)
43
+ %5 = arith.addi %4, %3 : i64 loc(#loc23)
44
+ %6 = tt.splat %5 : i64 -> tensor<128xi64> loc(#loc25)
45
+ %7 = arith.muli %x1_15, %6 : tensor<128xi64> loc(#loc25)
46
+ %8 = arith.addi %x0_7, %7 : tensor<128xi64> loc(#loc26)
47
+ %9 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<128x!tt.ptr<f32>> loc(#loc27)
48
+ %10 = tt.addptr %9, %8 : tensor<128x!tt.ptr<f32>>, tensor<128xi64> loc(#loc27)
49
+ tt.store %10, %tmp2_18, %xmask_5 : tensor<128x!tt.ptr<f32>> loc(#loc28)
50
+ tt.return loc(#loc29)
51
+ } loc(#loc)
52
+ } loc(#loc)
53
+ #loc1 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44)
54
+ #loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":24:51)
55
+ #loc3 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25)
56
+ #loc4 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34)
57
+ #loc5 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36)
58
+ #loc6 = loc(unknown)
59
+ #loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":27:18)
60
+ #loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":19:28)
61
+ #loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":19:33)
62
+ #loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":20:36)
63
+ #loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":20:23)
64
+ #loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":21:21)
65
+ #loc13 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":23:19)
66
+ #loc14 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16)
67
+ #loc15 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47)
68
+ #loc16 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32)
69
+ #loc17 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47)
70
+ #loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":25:30)
71
+ #loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":25:35)
72
+ #loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:49)
73
+ #loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:75)
74
+ #loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:66)
75
+ #loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:57)
76
+ #loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:41)
77
+ #loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:34)
78
+ #loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:30)
79
+ #loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:25)
80
+ #loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:88)
81
+ #loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/sz/csz4oyqya4trc5ll55zjge2jpbfnpn34ci462kngl55bi66jokcx.py":28:4)
82
+ #loc34 = loc("fixed"(#loc1))
83
+ #loc35 = loc("x1"(#loc2))
84
+ #loc36 = loc("fixed"(#loc4))
85
+ #loc37 = loc("tmp2"(#loc7))
86
+ #loc38 = loc("xoffset"(#loc8))
87
+ #loc39 = loc("xoffset"(#loc9))
88
+ #loc40 = loc("xindex"(#loc10))
89
+ #loc41 = loc("xindex"(#loc11))
90
+ #loc42 = loc("xmask"(#loc12))
91
+ #loc43 = loc("x0"(#loc13))
92
+ #loc44 = loc("quot"(#loc14))
93
+ #loc45 = loc("fixed"(#loc15))
94
+ #loc46 = loc("tmp0"(#loc18))
95
+ #loc47 = loc("tmp0"(#loc19))
96
+ #loc48 = loc(fused[#loc23, #loc24])
97
+ #loc49 = loc(callsite(#loc34 at #loc35))
98
+ #loc50 = loc(callsite(#loc3 at #loc35))
99
+ #loc51 = loc(callsite(#loc36 at #loc35))
100
+ #loc52 = loc(callsite(#loc5 at #loc35))
101
+ #loc53 = loc(callsite(#loc44 at #loc35))
102
+ #loc54 = loc(callsite(#loc45 at #loc35))
103
+ #loc55 = loc(callsite(#loc16 at #loc35))
104
+ #loc56 = loc(callsite(#loc17 at #loc35))
progress/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/__grp__triton_tem_fused_0.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_tem_fused_0.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.source", "triton_tem_fused_0.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ttir", "triton_tem_fused_0.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ttgir", "triton_tem_fused_0.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.llir", "triton_tem_fused_0.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ptx", "triton_tem_fused_0.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.cubin", "triton_tem_fused_0.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.json"}}
progress/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "ef65677dccb0fd1ca33e2efd85dd27b554735f6893116cae461084f5b56323fe", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 196608, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_tem_fused_0"}
progress/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.llir ADDED
The diff for this file is too large to render. See raw diff
 
progress/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ptx ADDED
The diff for this file is too large to render. See raw diff
 
progress/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.source ADDED
The diff for this file is too large to render. See raw diff
 
progress/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ttgir ADDED
The diff for this file is too large to render. See raw diff
 
progress/SpecForge/cache/compiled_kernels/triton/3/55SWO7OMWD6RZIZ6F36YLXJHWVKHGX3ISMIWZLSGCCCPLNLDEP7A/triton_tem_fused_0.ttir ADDED
@@ -0,0 +1,896 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #loc = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":18:0)
2
+ #loc1 = loc(unknown)
3
+ #loc2 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":200:41)
4
+ #loc66 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":568:16)
5
+ #loc111 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":449:51)
6
+ #loc123 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":462:34)
7
+ #loc167 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":235:45)
8
+ #loc193 = loc("arg_Q"(#loc))
9
+ #loc194 = loc("arg_K"(#loc))
10
+ #loc195 = loc("arg_V"(#loc))
11
+ #loc196 = loc("arg_M"(#loc))
12
+ #loc197 = loc("arg_L"(#loc))
13
+ #loc198 = loc("arg_KV_NUM_BLKS"(#loc))
14
+ #loc199 = loc("arg_KV_IDX"(#loc))
15
+ #loc200 = loc("arg_FULL_KV_NUM_BLKS"(#loc))
16
+ #loc201 = loc("arg_FULL_KV_IDX"(#loc))
17
+ #loc202 = loc("out_ptr0"(#loc))
18
+ #loc203 = loc("ks0"(#loc))
19
+ #loc204 = loc("ks1"(#loc))
20
+ #loc255 = loc(callsite(#loc66 at #loc2))
21
+ #loc296 = loc("m_ij"(#loc111))
22
+ #loc306 = loc("l_i"(#loc123))
23
+ #loc346 = loc(callsite(#loc66 at #loc167))
24
+ #loc406 = loc(callsite(#loc296 at #loc255))
25
+ #loc416 = loc(callsite(#loc306 at #loc255))
26
+ #loc435 = loc(callsite(#loc296 at #loc346))
27
+ #loc445 = loc(callsite(#loc306 at #loc346))
28
+ #loc465 = loc(callsite(#loc1 at #loc406))
29
+ #loc467 = loc(callsite(#loc1 at #loc416))
30
+ #loc495 = loc(callsite(#loc1 at #loc435))
31
+ #loc497 = loc(callsite(#loc1 at #loc445))
32
+ module {
33
+ tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_M: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("arg_M"(#loc)), %arg_L: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("arg_L"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr<i32> {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc))) attributes {noinline = false} {
34
+ %cst = arith.constant dense<0> : tensor<1x128xi64> loc(#loc1)
35
+ %cst_0 = arith.constant dense<1024> : tensor<64x1xi32> loc(#loc1)
36
+ %cst_1 = arith.constant dense<0.000000e+00> : tensor<64x128xbf16> loc(#loc1)
37
+ %cst_2 = arith.constant dense<16> : tensor<1x64xi32> loc(#loc205)
38
+ %cst_3 = arith.constant dense<16> : tensor<512x1xi32> loc(#loc205)
39
+ %cst_4 = arith.constant dense<0xFF800000> : tensor<512xf32> loc(#loc1)
40
+ %cst_5 = arith.constant dense<1.44269502> : tensor<512x64xf32> loc(#loc1)
41
+ %cst_6 = arith.constant dense<false> : tensor<512x64xi1> loc(#loc205)
42
+ %cst_7 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc205)
43
+ %cst_8 = arith.constant dense<1> : tensor<512x1xi32> loc(#loc205)
44
+ %cst_9 = arith.constant dense<0> : tensor<512x1xi32> loc(#loc205)
45
+ %cst_10 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc205)
46
+ %cst_11 = arith.constant dense<0xFF800000> : tensor<512x64xf32> loc(#loc1)
47
+ %cst_12 = arith.constant dense<0.0883883461> : tensor<512x64xf32> loc(#loc1)
48
+ %cst_13 = arith.constant dense<0.000000e+00> : tensor<512x64xf32> loc(#loc1)
49
+ %acc = arith.constant dense<0.000000e+00> : tensor<512x128xf32> loc(#loc360)
50
+ %cst_14 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc1)
51
+ %c63_i32 = arith.constant 63 : i32 loc(#loc1)
52
+ %c31_i32 = arith.constant 31 : i32 loc(#loc1)
53
+ %cst_15 = arith.constant dense<128> : tensor<1x128x1xi32> loc(#loc1)
54
+ %mask = arith.constant dense<128> : tensor<1x1x128xi32> loc(#loc207)
55
+ %c0_i32 = arith.constant 0 : i32 loc(#loc1)
56
+ %c2_i32 = arith.constant 2 : i32 loc(#loc1)
57
+ %q_range = arith.constant dense<4096> : tensor<1x128x1xi32> loc(#loc208)
58
+ %cst_16 = arith.constant dense<128> : tensor<4x1x1xi32> loc(#loc1)
59
+ %true = arith.constant true loc(#loc7)
60
+ %c64_i32 = arith.constant 64 : i32 loc(#loc1)
61
+ %c4_i32 = arith.constant 4 : i32 loc(#loc1)
62
+ %HKV = arith.constant 8 : i32 loc(#loc209)
63
+ %c32_i32 = arith.constant 32 : i32 loc(#loc1)
64
+ %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
65
+ %c1_i32 = arith.constant 1 : i32 loc(#loc1)
66
+ %c128_i32 = arith.constant 128 : i32 loc(#loc1)
67
+ %c512_i32 = arith.constant 512 : i32 loc(#loc9)
68
+ %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
69
+ %0 = arith.muli %ks0, %c4096_i32 : i32 loc(#loc10)
70
+ %1 = arith.muli %ks0, %c1024_i32 : i32 loc(#loc11)
71
+ %2 = arith.muli %ks0, %c32_i32 : i32 loc(#loc12)
72
+ %TILE_KV_OG = arith.addi %ks1, %c31_i32 : i32 loc(#loc361)
73
+ %TILE_KV_OG_17 = arith.divsi %TILE_KV_OG, %c32_i32 : i32 loc(#loc362)
74
+ %TILE_KV = arith.addi %TILE_KV_OG_17, %c63_i32 : i32 loc(#loc363)
75
+ %TILE_KV_18 = arith.divsi %TILE_KV, %c64_i32 : i32 loc(#loc364)
76
+ %TILE_KV_19 = arith.muli %TILE_KV_18, %c64_i32 : i32 loc(#loc212)
77
+ %3 = arith.divsi %TILE_KV_19, %c64_i32 : i32 loc(#loc18)
78
+ %off_z = tt.get_program_id x : i32 loc(#loc213)
79
+ %off_z_20 = arith.divsi %off_z, %HKV : i32 loc(#loc214)
80
+ %off_hkv = arith.remsi %off_z, %HKV : i32 loc(#loc215)
81
+ %off_t = tt.get_program_id y : i32 loc(#loc216)
82
+ %q_offset = arith.muli %off_z_20, %0 : i32 loc(#loc217)
83
+ %q_offset_21 = arith.muli %off_hkv, %c512_i32 : i32 loc(#loc218)
84
+ %q_offset_22 = arith.addi %q_offset, %q_offset_21 : i32 loc(#loc219)
85
+ %k_offset = arith.muli %off_hkv, %c128_i32 : i32 loc(#loc220)
86
+ %K = tt.addptr %arg_K, %k_offset : !tt.ptr<bf16>, i32 loc(#loc221)
87
+ %V = tt.addptr %arg_V, %k_offset : !tt.ptr<bf16>, i32 loc(#loc222)
88
+ tt.assert %true, "" : i1 loc(#loc7)
89
+ %off_g = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc223)
90
+ %off_m = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc224)
91
+ %offs_m = tt.expand_dims %off_m {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc225)
92
+ %offs_m_23 = tt.broadcast %offs_m : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc226)
93
+ %offs_m_24 = tt.reshape %offs_m_23 : tensor<4x128xi32> -> tensor<512xi32> loc(#loc365)
94
+ %block_n_start = arith.muli %off_t, %3 : i32 loc(#loc228)
95
+ %block_n_end = arith.addi %block_n_start, %3 : i32 loc(#loc229)
96
+ %q_range_25 = tt.expand_dims %off_g {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc230)
97
+ %q_range_26 = tt.expand_dims %q_range_25 {axis = 2 : i32} : tensor<4x1xi32> -> tensor<4x1x1xi32> loc(#loc230)
98
+ %q_range_27 = arith.muli %q_range_26, %cst_16 : tensor<4x1x1xi32> loc(#loc231)
99
+ %q_range_28 = tt.expand_dims %offs_m {axis = 2 : i32} : tensor<1x128xi32> -> tensor<1x128x1xi32> loc(#loc232)
100
+ %q_range_29 = arith.muli %q_range_28, %q_range : tensor<1x128x1xi32> loc(#loc208)
101
+ %q_range_30 = tt.broadcast %q_range_27 : tensor<4x1x1xi32> -> tensor<4x128x1xi32> loc(#loc233)
102
+ %q_range_31 = tt.broadcast %q_range_29 : tensor<1x128x1xi32> -> tensor<4x128x1xi32> loc(#loc233)
103
+ %q_range_32 = arith.addi %q_range_30, %q_range_31 : tensor<4x128x1xi32> loc(#loc233)
104
+ %q_range_33 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<1x128xi32> -> tensor<1x1x128xi32> loc(#loc234)
105
+ %q_range_34 = tt.broadcast %q_range_32 : tensor<4x128x1xi32> -> tensor<4x128x128xi32> loc(#loc235)
106
+ %q_range_35 = tt.broadcast %q_range_33 : tensor<1x1x128xi32> -> tensor<4x128x128xi32> loc(#loc235)
107
+ %q_range_36 = arith.addi %q_range_34, %q_range_35 : tensor<4x128x128xi32> loc(#loc235)
108
+ %q = tt.splat %ks0 : i32 -> tensor<1x128x1xi32> loc(#loc236)
109
+ %q_37 = arith.cmpi slt, %q_range_28, %q : tensor<1x128x1xi32> loc(#loc236)
110
+ %q_38 = tt.addptr %arg_Q, %q_offset_22 : !tt.ptr<bf16>, i32 loc(#loc237)
111
+ %q_39 = tt.splat %q_38 : !tt.ptr<bf16> -> tensor<4x128x128x!tt.ptr<bf16>> loc(#loc238)
112
+ %q_40 = tt.addptr %q_39, %q_range_36 : tensor<4x128x128x!tt.ptr<bf16>>, tensor<4x128x128xi32> loc(#loc238)
113
+ %q_41 = tt.broadcast %q_37 : tensor<1x128x1xi1> -> tensor<4x128x128xi1> loc(#loc239)
114
+ %q_42 = tt.load %q_40, %q_41 : tensor<4x128x128x!tt.ptr<bf16>> loc(#loc239)
115
+ %q_43 = tt.reshape %q_42 : tensor<4x128x128xbf16> -> tensor<512x128xbf16> loc(#loc240)
116
+ %kv_num_blocks = tt.load %arg_KV_NUM_BLKS : !tt.ptr<i32> loc(#loc241)
117
+ %off_n_block_in_sparse = arith.remsi %block_n_start, %c2_i32 : i32 loc(#loc242)
118
+ %off_n = tt.load %arg_KV_IDX : !tt.ptr<i32> loc(#loc243)
119
+ %off_n_44 = arith.muli %off_n, %c128_i32 : i32 loc(#loc244)
120
+ %off_n_45 = arith.muli %off_n_block_in_sparse, %c64_i32 : i32 loc(#loc245)
121
+ %off_n_46 = arith.addi %off_n_44, %off_n_45 : i32 loc(#loc246)
122
+ %block_n_last_valid = arith.muli %kv_num_blocks, %c2_i32 : i32 loc(#loc247)
123
+ %block_n_last_valid_47 = arith.addi %ks1, %c63_i32 : i32 loc(#loc366)
124
+ %block_n_last_valid_48 = arith.divsi %block_n_last_valid_47, %c64_i32 : i32 loc(#loc367)
125
+ %block_n_last_valid_49 = arith.maxsi %block_n_last_valid_48, %c1_i32 : i32 loc(#loc249)
126
+ %block_n_last_valid_50 = arith.minsi %block_n_last_valid, %block_n_last_valid_49 : i32 loc(#loc250)
127
+ %offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc251)
128
+ %offs_n_51 = tt.splat %off_n_46 : i32 -> tensor<64xi32> loc(#loc252)
129
+ %offs_n_52 = arith.addi %offs_n, %offs_n_51 : tensor<64xi32> loc(#loc252)
130
+ %4 = tt.expand_dims %offs_m_24 {axis = 1 : i32} : tensor<512xi32> -> tensor<512x1xi32> loc(#loc60)
131
+ %5 = tt.expand_dims %offs_n_52 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc61)
132
+ %6 = arith.cmpi sle, %block_n_end, %block_n_last_valid_50 : i32 loc(#loc62)
133
+ %7 = arith.select %6, %block_n_end, %block_n_last_valid_50 : i32 loc(#loc63)
134
+ %kv_offset:5 = scf.for %start_n = %block_n_start to %7 step %c1_i32 iter_args(%acc_78 = %acc, %l_i_79 = %cst_14, %m_i_80 = %cst_4, %offs_n_81 = %5, %kv_offset_82 = %c0_i32) -> (tensor<512x128xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x64xi32>, i32) : i32 {
135
+ %kv_base_offset = arith.addi %off_n_46, %kv_offset_82 : i32 loc(#loc369)
136
+ %offs_n_load = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc370)
137
+ %offs_n_load_83 = arith.addi %offs_n_load, %offs_n : tensor<64xi32> loc(#loc370)
138
+ %ptr = tt.expand_dims %offs_n_load_83 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc456)
139
+ %ptr_84 = arith.muli %ptr, %cst_0 : tensor<64x1xi32> loc(#loc457)
140
+ %ptr_85 = tt.splat %K : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc458)
141
+ %ptr_86 = tt.addptr %ptr_85, %ptr_84 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc458)
142
+ %ptr_87 = tt.broadcast %ptr_86 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc459)
143
+ %ptr_88 = tt.broadcast %offs_m : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc459)
144
+ %ptr_89 = tt.addptr %ptr_87, %ptr_88 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc459)
145
+ %k = tt.splat %ks1 : i32 -> tensor<64x1xi32> loc(#loc460)
146
+ %k_90 = arith.cmpi slt, %ptr, %k : tensor<64x1xi32> loc(#loc460)
147
+ %k_91 = tt.broadcast %k_90 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc461)
148
+ %k_92 = tt.load %ptr_89, %k_91, %cst_1 : tensor<64x128x!tt.ptr<bf16>> loc(#loc461)
149
+ %k_93 = tt.trans %k_92 {order = array<i32: 1, 0>} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc372)
150
+ %qk = tt.dot %q_43, %k_93, %cst_13, inputPrecision = tf32 : tensor<512x128xbf16> * tensor<128x64xbf16> -> tensor<512x64xf32> loc(#loc373)
151
+ %qk_94 = arith.mulf %qk, %cst_12 : tensor<512x64xf32> loc(#loc374)
152
+ %m = tt.splat %ks0 : i32 -> tensor<512x1xi32> loc(#loc462)
153
+ %m_95 = arith.remsi %4, %m : tensor<512x1xi32> loc(#loc462)
154
+ %n = tt.splat %ks1 : i32 -> tensor<1x64xi32> loc(#loc463)
155
+ %n_96 = arith.remsi %offs_n_81, %n : tensor<1x64xi32> loc(#loc463)
156
+ %post_mod_scores = arith.cmpi slt, %offs_n_81, %n : tensor<1x64xi32> loc(#loc377)
157
+ %post_mod_scores_97 = tt.broadcast %post_mod_scores : tensor<1x64xi1> -> tensor<512x64xi1> loc(#loc378)
158
+ %post_mod_scores_98 = arith.select %post_mod_scores_97, %qk_94, %cst_11 : tensor<512x64xi1>, tensor<512x64xf32> loc(#loc378)
159
+ %tmp3 = arith.cmpi slt, %m_95, %cst_9 : tensor<512x1xi32> loc(#loc379)
160
+ %tmp5 = tt.broadcast %n_96 : tensor<1x64xi32> -> tensor<512x64xi32> loc(#loc380)
161
+ %tmp5_99 = tt.broadcast %m_95 : tensor<512x1xi32> -> tensor<512x64xi32> loc(#loc380)
162
+ %tmp5_100 = arith.cmpi sle, %tmp5, %tmp5_99 : tensor<512x64xi32> loc(#loc380)
163
+ %tmp6 = tt.broadcast %tmp3 : tensor<512x1xi1> -> tensor<512x64xi1> loc(#loc381)
164
+ %tmp6_101 = arith.andi %tmp6, %tmp5_100 : tensor<512x64xi1> loc(#loc381)
165
+ %tmp7 = arith.cmpi sge, %m_95, %cst_9 : tensor<512x1xi32> loc(#loc382)
166
+ %tmp8 = arith.cmpi slt, %n_96, %cst_10 : tensor<1x64xi32> loc(#loc383)
167
+ %tmp9 = tt.broadcast %tmp7 : tensor<512x1xi1> -> tensor<512x64xi1> loc(#loc384)
168
+ %tmp9_102 = tt.broadcast %tmp8 : tensor<1x64xi1> -> tensor<512x64xi1> loc(#loc384)
169
+ %tmp9_103 = arith.andi %tmp9, %tmp9_102 : tensor<512x64xi1> loc(#loc384)
170
+ %tmp10 = arith.extui %tmp8 : tensor<1x64xi1> to tensor<1x64xi32> loc(#loc385)
171
+ %tmp10_104 = arith.cmpi eq, %tmp10, %cst_10 : tensor<1x64xi32> loc(#loc385)
172
+ %tmp11 = tt.broadcast %tmp10_104 : tensor<1x64xi1> -> tensor<512x64xi1> loc(#loc386)
173
+ %tmp11_105 = arith.andi %tmp9, %tmp11 : tensor<512x64xi1> loc(#loc386)
174
+ %tmp14 = arith.remsi %m_95, %cst_3 : tensor<512x1xi32> loc(#loc387)
175
+ %tmp14_106 = arith.cmpi ne, %tmp14, %cst_9 : tensor<512x1xi32> loc(#loc388)
176
+ %tmp14_107 = arith.divsi %m_95, %cst_3 : tensor<512x1xi32> loc(#loc389)
177
+ %tmp14_108 = arith.subi %tmp14_107, %cst_8 : tensor<512x1xi32> loc(#loc390)
178
+ %tmp14_109 = arith.select %tmp14_106, %tmp14_108, %tmp14_107 : tensor<512x1xi1>, tensor<512x1xi32> loc(#loc391)
179
+ %tmp14_110 = arith.select %tmp3, %tmp14_109, %tmp14_107 : tensor<512x1xi1>, tensor<512x1xi32> loc(#loc392)
180
+ %tmp16 = arith.remsi %n_96, %cst_2 : tensor<1x64xi32> loc(#loc393)
181
+ %tmp16_111 = arith.cmpi ne, %tmp16, %cst_10 : tensor<1x64xi32> loc(#loc394)
182
+ %tmp16_112 = arith.divsi %n_96, %cst_2 : tensor<1x64xi32> loc(#loc395)
183
+ %tmp16_113 = arith.subi %tmp16_112, %cst_7 : tensor<1x64xi32> loc(#loc396)
184
+ %tmp16_114 = arith.select %tmp16_111, %tmp16_113, %tmp16_112 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc397)
185
+ %tmp16_115 = arith.select %tmp8, %tmp16_114, %tmp16_112 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc398)
186
+ %tmp17 = tt.broadcast %tmp14_110 : tensor<512x1xi32> -> tensor<512x64xi32> loc(#loc399)
187
+ %tmp17_116 = tt.broadcast %tmp16_115 : tensor<1x64xi32> -> tensor<512x64xi32> loc(#loc399)
188
+ %tmp17_117 = arith.cmpi eq, %tmp17, %tmp17_116 : tensor<512x64xi32> loc(#loc399)
189
+ %tmp18 = arith.andi %tmp11_105, %tmp17_117 : tensor<512x64xi1> loc(#loc400)
190
+ %tmp19 = arith.ori %tmp9_103, %tmp18 : tensor<512x64xi1> loc(#loc401)
191
+ %tmp20 = arith.ori %tmp6_101, %tmp19 : tensor<512x64xi1> loc(#loc402)
192
+ %mask_mod_output = arith.select %post_mod_scores_97, %tmp20, %cst_6 : tensor<512x64xi1>, tensor<512x64xi1> loc(#loc403)
193
+ %post_mod_scores_118 = arith.select %mask_mod_output, %post_mod_scores_98, %cst_11 : tensor<512x64xi1>, tensor<512x64xf32> loc(#loc404)
194
+ %post_mod_scores_119 = arith.mulf %post_mod_scores_118, %cst_5 : tensor<512x64xf32> loc(#loc405)
195
+ %m_ij = "tt.reduce"(%post_mod_scores_119) <{axis = 1 : i32}> ({
196
+ ^bb0(%m_ij_152: f32 loc(callsite(#loc1 at #loc406)), %m_ij_153: f32 loc(callsite(#loc1 at #loc406))):
197
+ %m_ij_154 = arith.maxnumf %m_ij_152, %m_ij_153 : f32 loc(#loc519)
198
+ tt.reduce.return %m_ij_154 : f32 loc(#loc464)
199
+ }) : (tensor<512x64xf32>) -> tensor<512xf32> loc(#loc464)
200
+ %m_ij_120 = arith.maxnumf %m_i_80, %m_ij : tensor<512xf32> loc(#loc407)
201
+ %masked_out_rows = arith.cmpf oeq, %m_ij_120, %cst_4 : tensor<512xf32> loc(#loc408)
202
+ %m_ij_masked = arith.select %masked_out_rows, %cst_14, %m_ij_120 : tensor<512xi1>, tensor<512xf32> loc(#loc409)
203
+ %alpha = arith.subf %m_i_80, %m_ij_masked : tensor<512xf32> loc(#loc410)
204
+ %alpha_121 = math.exp2 %alpha : tensor<512xf32> loc(#loc411)
205
+ %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<512xf32> -> tensor<512x1xf32> loc(#loc412)
206
+ %p_122 = tt.broadcast %p : tensor<512x1xf32> -> tensor<512x64xf32> loc(#loc413)
207
+ %p_123 = arith.subf %post_mod_scores_119, %p_122 : tensor<512x64xf32> loc(#loc413)
208
+ %p_124 = math.exp2 %p_123 : tensor<512x64xf32> loc(#loc414)
209
+ %l_i_125 = arith.mulf %l_i_79, %alpha_121 : tensor<512xf32> loc(#loc415)
210
+ %l_i_126 = "tt.reduce"(%p_124) <{axis = 1 : i32}> ({
211
+ ^bb0(%l_i_152: f32 loc(callsite(#loc1 at #loc416)), %l_i_153: f32 loc(callsite(#loc1 at #loc416))):
212
+ %l_i_154 = arith.addf %l_i_152, %l_i_153 : f32 loc(#loc520)
213
+ tt.reduce.return %l_i_154 : f32 loc(#loc466)
214
+ }) : (tensor<512x64xf32>) -> tensor<512xf32> loc(#loc466)
215
+ %l_i_127 = arith.addf %l_i_125, %l_i_126 : tensor<512xf32> loc(#loc417)
216
+ %acc_128 = tt.expand_dims %alpha_121 {axis = 1 : i32} : tensor<512xf32> -> tensor<512x1xf32> loc(#loc418)
217
+ %acc_129 = tt.broadcast %acc_128 : tensor<512x1xf32> -> tensor<512x128xf32> loc(#loc419)
218
+ %acc_130 = arith.mulf %acc_78, %acc_129 : tensor<512x128xf32> loc(#loc419)
219
+ %ptr_131 = tt.splat %V : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc468)
220
+ %ptr_132 = tt.addptr %ptr_131, %ptr_84 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc468)
221
+ %ptr_133 = tt.broadcast %ptr_132 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc469)
222
+ %ptr_134 = tt.addptr %ptr_133, %ptr_88 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc469)
223
+ %v = tt.load %ptr_134, %k_91, %cst_1 : tensor<64x128x!tt.ptr<bf16>> loc(#loc470)
224
+ %acc_135 = arith.truncf %p_124 : tensor<512x64xf32> to tensor<512x64xbf16> loc(#loc421)
225
+ %acc_136 = tt.dot %acc_135, %v, %acc_130, inputPrecision = tf32 : tensor<512x64xbf16> * tensor<64x128xbf16> -> tensor<512x128xf32> loc(#loc422)
226
+ %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc471)
227
+ %cur_block = tt.addptr %arg_KV_IDX, %cur_block_idx : !tt.ptr<i32>, i32 loc(#loc472)
228
+ %cur_block_137 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc473)
229
+ %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc474)
230
+ %next_block_138 = arith.cmpi slt, %next_block, %kv_num_blocks : i32 loc(#loc475)
231
+ %next_block_139 = tt.addptr %cur_block, %c1_i32 : !tt.ptr<i32>, i32 loc(#loc476)
232
+ %next_block_140 = tt.load %next_block_139, %next_block_138 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc477)
233
+ %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc478)
234
+ %needs_jump_141 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc479)
235
+ %needs_jump_142 = arith.cmpi eq, %needs_jump_141, %c0_i32 : i32 loc(#loc480)
236
+ %jump_to_block = arith.subi %next_block_140, %cur_block_137 : i32 loc(#loc481)
237
+ %jump_to_block_143 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc482)
238
+ %jump_to_block_144 = arith.subi %jump_to_block_143, %c64_i32 : i32 loc(#loc483)
239
+ %offset = arith.extui %needs_jump_142 : i1 to i32 loc(#loc484)
240
+ %offset_145 = arith.muli %jump_to_block_144, %offset : i32 loc(#loc484)
241
+ %offset_146 = arith.subi %c1_i32, %offset : i32 loc(#loc485)
242
+ %offset_147 = arith.muli %offset_146, %c64_i32 : i32 loc(#loc486)
243
+ %offset_148 = arith.addi %offset_145, %offset_147 : i32 loc(#loc487)
244
+ %offs_n_149 = tt.splat %offset_148 : i32 -> tensor<1x64xi32> loc(#loc424)
245
+ %offs_n_150 = arith.addi %offs_n_81, %offs_n_149 : tensor<1x64xi32> loc(#loc424)
246
+ %kv_offset_151 = arith.addi %kv_offset_82, %offset_148 : i32 loc(#loc425)
247
+ scf.yield %acc_136, %l_i_127, %m_ij_120, %offs_n_150, %kv_offset_151 : tensor<512x128xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x64xi32>, i32 loc(#loc333)
248
+ } loc(#loc524)
249
+ %kv_num_blocks_53 = tt.load %arg_FULL_KV_NUM_BLKS : !tt.ptr<i32> loc(#loc334)
250
+ %block_n_start_54 = arith.subi %c31_i32, %off_t : i32 loc(#loc335)
251
+ %block_n_start_55 = arith.muli %block_n_start_54, %3 : i32 loc(#loc336)
252
+ %block_n_end_56 = arith.addi %block_n_start_55, %3 : i32 loc(#loc337)
253
+ %off_n_block_in_sparse_57 = arith.remsi %block_n_start_55, %c2_i32 : i32 loc(#loc338)
254
+ %off_n_58 = tt.load %arg_FULL_KV_IDX : !tt.ptr<i32> loc(#loc339)
255
+ %off_n_59 = arith.muli %off_n_58, %c128_i32 : i32 loc(#loc340)
256
+ %off_n_60 = arith.muli %off_n_block_in_sparse_57, %c64_i32 : i32 loc(#loc341)
257
+ %off_n_61 = arith.addi %off_n_59, %off_n_60 : i32 loc(#loc342)
258
+ %block_n_last_valid_62 = arith.muli %kv_num_blocks_53, %c2_i32 : i32 loc(#loc343)
259
+ %block_n_last_valid_63 = arith.minsi %block_n_last_valid_62, %block_n_last_valid_49 : i32 loc(#loc344)
260
+ %offs_n_64 = tt.splat %off_n_61 : i32 -> tensor<64xi32> loc(#loc345)
261
+ %offs_n_65 = arith.addi %offs_n, %offs_n_64 : tensor<64xi32> loc(#loc345)
262
+ %8 = tt.expand_dims %offs_n_65 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc164)
263
+ %9 = arith.cmpi sle, %block_n_end_56, %block_n_last_valid_63 : i32 loc(#loc165)
264
+ %10 = arith.select %9, %block_n_end_56, %block_n_last_valid_63 : i32 loc(#loc166)
265
+ %kv_offset_66:5 = scf.for %start_n = %block_n_start_55 to %10 step %c1_i32 iter_args(%acc_78 = %kv_offset#0, %l_i_79 = %kv_offset#1, %m_i_80 = %kv_offset#2, %offs_n_81 = %8, %kv_offset_82 = %c0_i32) -> (tensor<512x128xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x64xi32>, i32) : i32 {
266
+ %kv_base_offset = arith.addi %off_n_61, %kv_offset_82 : i32 loc(#loc426)
267
+ %offs_n_load = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc427)
268
+ %offs_n_load_83 = arith.addi %offs_n_load, %offs_n : tensor<64xi32> loc(#loc427)
269
+ %ptr = tt.expand_dims %offs_n_load_83 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc488)
270
+ %ptr_84 = arith.muli %ptr, %cst_0 : tensor<64x1xi32> loc(#loc489)
271
+ %ptr_85 = tt.splat %K : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc490)
272
+ %ptr_86 = tt.addptr %ptr_85, %ptr_84 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc490)
273
+ %ptr_87 = tt.broadcast %ptr_86 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc491)
274
+ %ptr_88 = tt.broadcast %offs_m : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc491)
275
+ %ptr_89 = tt.addptr %ptr_87, %ptr_88 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc491)
276
+ %k = tt.splat %ks1 : i32 -> tensor<64x1xi32> loc(#loc492)
277
+ %k_90 = arith.cmpi slt, %ptr, %k : tensor<64x1xi32> loc(#loc492)
278
+ %k_91 = tt.broadcast %k_90 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc493)
279
+ %k_92 = tt.load %ptr_89, %k_91, %cst_1 : tensor<64x128x!tt.ptr<bf16>> loc(#loc493)
280
+ %k_93 = tt.trans %k_92 {order = array<i32: 1, 0>} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc429)
281
+ %qk = tt.dot %q_43, %k_93, %cst_13, inputPrecision = tf32 : tensor<512x128xbf16> * tensor<128x64xbf16> -> tensor<512x64xf32> loc(#loc430)
282
+ %qk_94 = arith.mulf %qk, %cst_12 : tensor<512x64xf32> loc(#loc431)
283
+ %post_mod_scores = tt.splat %ks1 : i32 -> tensor<1x64xi32> loc(#loc432)
284
+ %post_mod_scores_95 = arith.cmpi slt, %offs_n_81, %post_mod_scores : tensor<1x64xi32> loc(#loc432)
285
+ %post_mod_scores_96 = tt.broadcast %post_mod_scores_95 : tensor<1x64xi1> -> tensor<512x64xi1> loc(#loc433)
286
+ %post_mod_scores_97 = arith.select %post_mod_scores_96, %qk_94, %cst_11 : tensor<512x64xi1>, tensor<512x64xf32> loc(#loc433)
287
+ %post_mod_scores_98 = arith.mulf %post_mod_scores_97, %cst_5 : tensor<512x64xf32> loc(#loc434)
288
+ %m_ij = "tt.reduce"(%post_mod_scores_98) <{axis = 1 : i32}> ({
289
+ ^bb0(%m_ij_131: f32 loc(callsite(#loc1 at #loc435)), %m_ij_132: f32 loc(callsite(#loc1 at #loc435))):
290
+ %m_ij_133 = arith.maxnumf %m_ij_131, %m_ij_132 : f32 loc(#loc521)
291
+ tt.reduce.return %m_ij_133 : f32 loc(#loc494)
292
+ }) : (tensor<512x64xf32>) -> tensor<512xf32> loc(#loc494)
293
+ %m_ij_99 = arith.maxnumf %m_i_80, %m_ij : tensor<512xf32> loc(#loc436)
294
+ %masked_out_rows = arith.cmpf oeq, %m_ij_99, %cst_4 : tensor<512xf32> loc(#loc437)
295
+ %m_ij_masked = arith.select %masked_out_rows, %cst_14, %m_ij_99 : tensor<512xi1>, tensor<512xf32> loc(#loc438)
296
+ %alpha = arith.subf %m_i_80, %m_ij_masked : tensor<512xf32> loc(#loc439)
297
+ %alpha_100 = math.exp2 %alpha : tensor<512xf32> loc(#loc440)
298
+ %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<512xf32> -> tensor<512x1xf32> loc(#loc441)
299
+ %p_101 = tt.broadcast %p : tensor<512x1xf32> -> tensor<512x64xf32> loc(#loc442)
300
+ %p_102 = arith.subf %post_mod_scores_98, %p_101 : tensor<512x64xf32> loc(#loc442)
301
+ %p_103 = math.exp2 %p_102 : tensor<512x64xf32> loc(#loc443)
302
+ %l_i_104 = arith.mulf %l_i_79, %alpha_100 : tensor<512xf32> loc(#loc444)
303
+ %l_i_105 = "tt.reduce"(%p_103) <{axis = 1 : i32}> ({
304
+ ^bb0(%l_i_131: f32 loc(callsite(#loc1 at #loc445)), %l_i_132: f32 loc(callsite(#loc1 at #loc445))):
305
+ %l_i_133 = arith.addf %l_i_131, %l_i_132 : f32 loc(#loc522)
306
+ tt.reduce.return %l_i_133 : f32 loc(#loc496)
307
+ }) : (tensor<512x64xf32>) -> tensor<512xf32> loc(#loc496)
308
+ %l_i_106 = arith.addf %l_i_104, %l_i_105 : tensor<512xf32> loc(#loc446)
309
+ %acc_107 = tt.expand_dims %alpha_100 {axis = 1 : i32} : tensor<512xf32> -> tensor<512x1xf32> loc(#loc447)
310
+ %acc_108 = tt.broadcast %acc_107 : tensor<512x1xf32> -> tensor<512x128xf32> loc(#loc448)
311
+ %acc_109 = arith.mulf %acc_78, %acc_108 : tensor<512x128xf32> loc(#loc448)
312
+ %ptr_110 = tt.splat %V : !tt.ptr<bf16> -> tensor<64x1x!tt.ptr<bf16>> loc(#loc498)
313
+ %ptr_111 = tt.addptr %ptr_110, %ptr_84 : tensor<64x1x!tt.ptr<bf16>>, tensor<64x1xi32> loc(#loc498)
314
+ %ptr_112 = tt.broadcast %ptr_111 : tensor<64x1x!tt.ptr<bf16>> -> tensor<64x128x!tt.ptr<bf16>> loc(#loc499)
315
+ %ptr_113 = tt.addptr %ptr_112, %ptr_88 : tensor<64x128x!tt.ptr<bf16>>, tensor<64x128xi32> loc(#loc499)
316
+ %v = tt.load %ptr_113, %k_91, %cst_1 : tensor<64x128x!tt.ptr<bf16>> loc(#loc500)
317
+ %acc_114 = arith.truncf %p_103 : tensor<512x64xf32> to tensor<512x64xbf16> loc(#loc450)
318
+ %acc_115 = tt.dot %acc_114, %v, %acc_109, inputPrecision = tf32 : tensor<512x64xbf16> * tensor<64x128xbf16> -> tensor<512x128xf32> loc(#loc451)
319
+ %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc501)
320
+ %cur_block = tt.addptr %arg_FULL_KV_IDX, %cur_block_idx : !tt.ptr<i32>, i32 loc(#loc502)
321
+ %cur_block_116 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc503)
322
+ %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc504)
323
+ %next_block_117 = arith.cmpi slt, %next_block, %kv_num_blocks_53 : i32 loc(#loc505)
324
+ %next_block_118 = tt.addptr %cur_block, %c1_i32 : !tt.ptr<i32>, i32 loc(#loc506)
325
+ %next_block_119 = tt.load %next_block_118, %next_block_117 evictionPolicy = evict_last : !tt.ptr<i32> loc(#loc507)
326
+ %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc508)
327
+ %needs_jump_120 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc509)
328
+ %needs_jump_121 = arith.cmpi eq, %needs_jump_120, %c0_i32 : i32 loc(#loc510)
329
+ %jump_to_block = arith.subi %next_block_119, %cur_block_116 : i32 loc(#loc511)
330
+ %jump_to_block_122 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc512)
331
+ %jump_to_block_123 = arith.subi %jump_to_block_122, %c64_i32 : i32 loc(#loc513)
332
+ %offset = arith.extui %needs_jump_121 : i1 to i32 loc(#loc514)
333
+ %offset_124 = arith.muli %jump_to_block_123, %offset : i32 loc(#loc514)
334
+ %offset_125 = arith.subi %c1_i32, %offset : i32 loc(#loc515)
335
+ %offset_126 = arith.muli %offset_125, %c64_i32 : i32 loc(#loc516)
336
+ %offset_127 = arith.addi %offset_124, %offset_126 : i32 loc(#loc517)
337
+ %offs_n_128 = tt.splat %offset_127 : i32 -> tensor<1x64xi32> loc(#loc453)
338
+ %offs_n_129 = arith.addi %offs_n_81, %offs_n_128 : tensor<1x64xi32> loc(#loc453)
339
+ %kv_offset_130 = arith.addi %kv_offset_82, %offset_127 : i32 loc(#loc454)
340
+ scf.yield %acc_115, %l_i_106, %m_ij_99, %offs_n_129, %kv_offset_130 : tensor<512x128xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x64xi32>, i32 loc(#loc347)
341
+ } loc(#loc525)
342
+ %m_offset = arith.muli %off_t, %2 : i32 loc(#loc348)
343
+ %m_offset_67 = arith.muli %off_z_20, %1 : i32 loc(#loc349)
344
+ %m_offset_68 = arith.addi %m_offset, %m_offset_67 : i32 loc(#loc350)
345
+ %M_block_ptr = tt.addptr %arg_M, %m_offset_68 : !tt.ptr<f32>, i32 loc(#loc351)
346
+ %M_block_ptr_69 = arith.muli %off_hkv, %c4_i32 : i32 loc(#loc352)
347
+ %M_block_ptr_70 = arith.extsi %ks0 : i32 to i64 loc(#loc353)
348
+ %M_block_ptr_71 = arith.extsi %M_block_ptr_69 : i32 to i64 loc(#loc353)
349
+ %L_block_ptr = tt.addptr %arg_L, %m_offset_68 : !tt.ptr<f32>, i32 loc(#loc354)
350
+ %m_i = tt.reshape %kv_offset_66#2 : tensor<512xf32> -> tensor<4x128xf32> loc(#loc355)
351
+ %l_i = tt.reshape %kv_offset_66#1 : tensor<512xf32> -> tensor<4x128xf32> loc(#loc356)
352
+ %11 = tt.splat %M_block_ptr : !tt.ptr<f32> -> tensor<4x128x!tt.ptr<f32>> loc(#loc177)
353
+ %12 = tt.splat %M_block_ptr_71 : i64 -> tensor<4xi64> loc(#loc177)
354
+ %13 = arith.extsi %off_g : tensor<4xi32> to tensor<4xi64> loc(#loc177)
355
+ %14 = arith.addi %12, %13 : tensor<4xi64> loc(#loc177)
356
+ %15 = tt.expand_dims %14 {axis = 1 : i32} : tensor<4xi64> -> tensor<4x1xi64> loc(#loc177)
357
+ %16 = tt.splat %M_block_ptr_70 : i64 -> tensor<4x1xi64> loc(#loc177)
358
+ %17 = arith.muli %15, %16 : tensor<4x1xi64> loc(#loc177)
359
+ %18 = tt.broadcast %17 : tensor<4x1xi64> -> tensor<4x128xi64> loc(#loc177)
360
+ %19 = arith.extsi %off_m : tensor<128xi32> to tensor<128xi64> loc(#loc177)
361
+ %20 = tt.expand_dims %19 {axis = 0 : i32} : tensor<128xi64> -> tensor<1x128xi64> loc(#loc177)
362
+ %21 = tt.broadcast %20 : tensor<1x128xi64> -> tensor<4x128xi64> loc(#loc177)
363
+ %22 = arith.addi %18, %21 : tensor<4x128xi64> loc(#loc177)
364
+ %23 = tt.addptr %11, %22 : tensor<4x128x!tt.ptr<f32>>, tensor<4x128xi64> loc(#loc177)
365
+ %24 = arith.cmpi sge, %20, %cst : tensor<1x128xi64> loc(#loc177)
366
+ %25 = tt.splat %M_block_ptr_70 : i64 -> tensor<1x128xi64> loc(#loc177)
367
+ %26 = arith.cmpi slt, %20, %25 : tensor<1x128xi64> loc(#loc177)
368
+ %27 = arith.andi %24, %26 : tensor<1x128xi1> loc(#loc177)
369
+ %28 = tt.broadcast %27 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc177)
370
+ tt.store %23, %m_i, %28 : tensor<4x128x!tt.ptr<f32>> loc(#loc177)
371
+ %29 = tt.splat %L_block_ptr : !tt.ptr<f32> -> tensor<4x128x!tt.ptr<f32>> loc(#loc178)
372
+ %30 = tt.addptr %29, %22 : tensor<4x128x!tt.ptr<f32>>, tensor<4x128xi64> loc(#loc178)
373
+ tt.store %30, %l_i, %28 : tensor<4x128x!tt.ptr<f32>> loc(#loc178)
374
+ %idx_hq = tt.splat %M_block_ptr_69 : i32 -> tensor<4x1x1xi32> loc(#loc357)
375
+ %idx_hq_72 = arith.addi %idx_hq, %q_range_26 : tensor<4x1x1xi32> loc(#loc357)
376
+ %mask_73 = arith.cmpi slt, %q_range_33, %mask : tensor<1x1x128xi32> loc(#loc207)
377
+ %mask_74 = tt.broadcast %q_37 : tensor<1x128x1xi1> -> tensor<1x128x128xi1> loc(#loc358)
378
+ %mask_75 = tt.broadcast %mask_73 : tensor<1x1x128xi1> -> tensor<1x128x128xi1> loc(#loc358)
379
+ %mask_76 = arith.andi %mask_74, %mask_75 : tensor<1x128x128xi1> loc(#loc358)
380
+ %acc_77 = tt.reshape %kv_offset_66#0 : tensor<512x128xf32> -> tensor<4x128x128xf32> loc(#loc359)
381
+ %31 = arith.muli %q_range_28, %cst_15 : tensor<1x128x1xi32> loc(#loc182)
382
+ %32 = tt.broadcast %q_range_33 : tensor<1x1x128xi32> -> tensor<1x128x128xi32> loc(#loc183)
383
+ %33 = tt.broadcast %31 : tensor<1x128x1xi32> -> tensor<1x128x128xi32> loc(#loc183)
384
+ %34 = arith.addi %32, %33 : tensor<1x128x128xi32> loc(#loc183)
385
+ %35 = arith.muli %idx_hq_72, %cst_16 : tensor<4x1x1xi32> loc(#loc184)
386
+ %36 = tt.splat %ks0 : i32 -> tensor<4x1x1xi32> loc(#loc185)
387
+ %37 = arith.muli %35, %36 : tensor<4x1x1xi32> loc(#loc185)
388
+ %38 = tt.broadcast %34 : tensor<1x128x128xi32> -> tensor<4x128x128xi32> loc(#loc186)
389
+ %39 = tt.broadcast %37 : tensor<4x1x1xi32> -> tensor<4x128x128xi32> loc(#loc186)
390
+ %40 = arith.addi %38, %39 : tensor<4x128x128xi32> loc(#loc186)
391
+ %41 = arith.muli %off_t, %c4096_i32 : i32 loc(#loc187)
392
+ %42 = arith.muli %41, %ks0 : i32 loc(#loc188)
393
+ %43 = tt.splat %42 : i32 -> tensor<4x128x128xi32> loc(#loc189)
394
+ %44 = arith.addi %40, %43 : tensor<4x128x128xi32> loc(#loc189)
395
+ %45 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<4x128x128x!tt.ptr<f32>> loc(#loc190)
396
+ %46 = tt.addptr %45, %44 : tensor<4x128x128x!tt.ptr<f32>>, tensor<4x128x128xi32> loc(#loc190)
397
+ %47 = tt.broadcast %mask_76 : tensor<1x128x128xi1> -> tensor<4x128x128xi1> loc(#loc191)
398
+ tt.store %46, %acc_77, %47 : tensor<4x128x128x!tt.ptr<f32>> loc(#loc191)
399
+ tt.return loc(#loc192)
400
+ } loc(#loc)
401
+ } loc(#loc)
402
+ #loc3 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31)
403
+ #loc4 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":132:19)
404
+ #loc5 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":276:38)
405
+ #loc6 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":155:61)
406
+ #loc7 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":135:21)
407
+ #loc8 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":95:10)
408
+ #loc9 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":86:60)
409
+ #loc10 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":86:65)
410
+ #loc11 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":89:54)
411
+ #loc12 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":89:62)
412
+ #loc13 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22)
413
+ #loc14 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":104:33)
414
+ #loc15 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28)
415
+ #loc16 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":105:34)
416
+ #loc17 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":105:45)
417
+ #loc18 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":106:49)
418
+ #loc19 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":108:26)
419
+ #loc20 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":108:48)
420
+ #loc21 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":110:49)
421
+ #loc22 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":111:26)
422
+ #loc23 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":113:23)
423
+ #loc24 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":113:45)
424
+ #loc25 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":113:35)
425
+ #loc26 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":114:47)
426
+ #loc27 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":117:12)
427
+ #loc28 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":118:12)
428
+ #loc29 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":137:25)
429
+ #loc30 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":140:25)
430
+ #loc31 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":141:44)
431
+ #loc32 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":141:54)
432
+ #loc33 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":74:27)
433
+ #loc34 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":141:22)
434
+ #loc35 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":152:28)
435
+ #loc36 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":153:34)
436
+ #loc37 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":155:32)
437
+ #loc38 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":155:26)
438
+ #loc39 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":155:67)
439
+ #loc40 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":155:49)
440
+ #loc41 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":155:103)
441
+ #loc42 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":155:84)
442
+ #loc43 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":162:72)
443
+ #loc44 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":162:24)
444
+ #loc45 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":162:35)
445
+ #loc46 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":162:20)
446
+ #loc47 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":166:22)
447
+ #loc48 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":173:28)
448
+ #loc49 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":176:44)
449
+ #loc50 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":177:20)
450
+ #loc51 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":177:48)
451
+ #loc52 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":177:95)
452
+ #loc53 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":177:71)
453
+ #loc54 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":181:52)
454
+ #loc55 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":181:99)
455
+ #loc56 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":181:109)
456
+ #loc57 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":181:72)
457
+ #loc58 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":183:26)
458
+ #loc59 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":183:37)
459
+ #loc60 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":194:40)
460
+ #loc61 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":194:57)
461
+ #loc62 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":198:53)
462
+ #loc63 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":198:38)
463
+ #loc64 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":532:40)
464
+ #loc65 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":388:32)
465
+ #loc67 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":392:35)
466
+ #loc68 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":328:27)
467
+ #loc69 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":393:107)
468
+ #loc70 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":328:38)
469
+ #loc71 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":328:20)
470
+ #loc72 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":328:49)
471
+ #loc73 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":336:52)
472
+ #loc74 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":336:23)
473
+ #loc75 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":395:17)
474
+ #loc76 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":397:19)
475
+ #loc77 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":399:14)
476
+ #loc78 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":301:21)
477
+ #loc79 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":404:36)
478
+ #loc80 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":405:36)
479
+ #loc81 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":413:44)
480
+ #loc82 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":413:69)
481
+ #loc83 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":418:22)
482
+ #loc84 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":420:23)
483
+ #loc85 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":421:22)
484
+ #loc86 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":422:23)
485
+ #loc87 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":423:22)
486
+ #loc88 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":424:22)
487
+ #loc89 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":425:24)
488
+ #loc90 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":426:23)
489
+ #loc91 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":429:70)
490
+ #loc92 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":429:79)
491
+ #loc93 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":429:91)
492
+ #loc94 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":429:99)
493
+ #loc95 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":429:102)
494
+ #loc96 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":429:119)
495
+ #loc97 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":431:70)
496
+ #loc98 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":431:79)
497
+ #loc99 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":431:91)
498
+ #loc100 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":431:99)
499
+ #loc101 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":431:102)
500
+ #loc102 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":431:119)
501
+ #loc103 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":432:25)
502
+ #loc104 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":433:24)
503
+ #loc105 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":434:23)
504
+ #loc106 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":435:23)
505
+ #loc107 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":440:73)
506
+ #loc108 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":442:69)
507
+ #loc109 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":445:27)
508
+ #loc110 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40)
509
+ #loc112 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27)
510
+ #loc113 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":449:27)
511
+ #loc114 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":451:35)
512
+ #loc115 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":452:51)
513
+ #loc116 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":456:31)
514
+ #loc117 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":456:25)
515
+ #loc118 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":457:51)
516
+ #loc119 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":457:39)
517
+ #loc120 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":457:21)
518
+ #loc121 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":462:16)
519
+ #loc122 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36)
520
+ #loc124 = loc("/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15)
521
+ #loc125 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":462:24)
522
+ #loc126 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":464:22)
523
+ #loc127 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":464:16)
524
+ #loc128 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":467:107)
525
+ #loc129 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":468:22)
526
+ #loc130 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":468:44)
527
+ #loc131 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":291:33)
528
+ #loc132 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":575:63)
529
+ #loc133 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":292:38)
530
+ #loc134 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":292:24)
531
+ #loc135 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":293:109)
532
+ #loc136 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":293:113)
533
+ #loc137 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":293:55)
534
+ #loc138 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":293:25)
535
+ #loc139 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":294:30)
536
+ #loc140 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":294:35)
537
+ #loc141 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":294:60)
538
+ #loc142 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":295:34)
539
+ #loc143 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":295:48)
540
+ #loc144 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":295:63)
541
+ #loc145 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":296:29)
542
+ #loc146 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":296:47)
543
+ #loc147 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":296:61)
544
+ #loc148 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":296:42)
545
+ #loc149 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":578:26)
546
+ #loc150 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":579:21)
547
+ #loc151 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":579:8)
548
+ #loc152 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":210:32)
549
+ #loc153 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":212:44)
550
+ #loc154 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":212:49)
551
+ #loc155 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":213:38)
552
+ #loc156 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":215:48)
553
+ #loc157 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":216:24)
554
+ #loc158 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":216:52)
555
+ #loc159 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":216:99)
556
+ #loc160 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":216:75)
557
+ #loc161 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":219:56)
558
+ #loc162 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":219:76)
559
+ #loc163 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":221:41)
560
+ #loc164 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":229:61)
561
+ #loc165 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":233:57)
562
+ #loc166 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":233:42)
563
+ #loc168 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":239:23)
564
+ #loc169 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":239:43)
565
+ #loc170 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":239:35)
566
+ #loc171 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":243:17)
567
+ #loc172 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":246:25)
568
+ #loc173 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":248:8)
569
+ #loc174 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":251:17)
570
+ #loc175 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":260:25)
571
+ #loc176 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":261:25)
572
+ #loc177 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":266:30)
573
+ #loc178 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":267:30)
574
+ #loc179 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":272:25)
575
+ #loc180 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":276:30)
576
+ #loc181 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":277:41)
577
+ #loc182 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":279:53)
578
+ #loc183 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":279:49)
579
+ #loc184 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":279:65)
580
+ #loc185 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":279:72)
581
+ #loc186 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":279:61)
582
+ #loc187 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":279:83)
583
+ #loc188 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":279:89)
584
+ #loc189 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":279:78)
585
+ #loc190 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":279:25)
586
+ #loc191 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":279:112)
587
+ #loc192 = loc("/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/a7/ca7s7tp233627gzszjumpewmgfr3x27cno6rvah7rvjcudtqof37.py":279:4)
588
+ #loc205 = loc(callsite(#loc1 at #loc2))
589
+ #loc206 = loc("acc"(#loc4))
590
+ #loc207 = loc("mask"(#loc5))
591
+ #loc208 = loc("q_range"(#loc6))
592
+ #loc209 = loc("HKV"(#loc8))
593
+ #loc210 = loc("TILE_KV_OG"(#loc14))
594
+ #loc211 = loc("TILE_KV"(#loc16))
595
+ #loc212 = loc("TILE_KV"(#loc17))
596
+ #loc213 = loc("off_z"(#loc19))
597
+ #loc214 = loc("off_z"(#loc20))
598
+ #loc215 = loc("off_hkv"(#loc21))
599
+ #loc216 = loc("off_t"(#loc22))
600
+ #loc217 = loc("q_offset"(#loc23))
601
+ #loc218 = loc("q_offset"(#loc24))
602
+ #loc219 = loc("q_offset"(#loc25))
603
+ #loc220 = loc("k_offset"(#loc26))
604
+ #loc221 = loc("K"(#loc27))
605
+ #loc222 = loc("V"(#loc28))
606
+ #loc223 = loc("off_g"(#loc29))
607
+ #loc224 = loc("off_m"(#loc30))
608
+ #loc225 = loc("offs_m"(#loc31))
609
+ #loc226 = loc("offs_m"(#loc32))
610
+ #loc227 = loc("offs_m"(#loc34))
611
+ #loc228 = loc("block_n_start"(#loc35))
612
+ #loc229 = loc("block_n_end"(#loc36))
613
+ #loc230 = loc("q_range"(#loc37))
614
+ #loc231 = loc("q_range"(#loc38))
615
+ #loc232 = loc("q_range"(#loc39))
616
+ #loc233 = loc("q_range"(#loc40))
617
+ #loc234 = loc("q_range"(#loc41))
618
+ #loc235 = loc("q_range"(#loc42))
619
+ #loc236 = loc("q"(#loc43))
620
+ #loc237 = loc("q"(#loc44))
621
+ #loc238 = loc("q"(#loc45))
622
+ #loc239 = loc("q"(#loc46))
623
+ #loc240 = loc("q"(#loc47))
624
+ #loc241 = loc("kv_num_blocks"(#loc48))
625
+ #loc242 = loc("off_n_block_in_sparse"(#loc49))
626
+ #loc243 = loc("off_n"(#loc50))
627
+ #loc244 = loc("off_n"(#loc51))
628
+ #loc245 = loc("off_n"(#loc52))
629
+ #loc246 = loc("off_n"(#loc53))
630
+ #loc247 = loc("block_n_last_valid"(#loc54))
631
+ #loc248 = loc("block_n_last_valid"(#loc55))
632
+ #loc249 = loc("block_n_last_valid"(#loc56))
633
+ #loc250 = loc("block_n_last_valid"(#loc57))
634
+ #loc251 = loc("offs_n"(#loc58))
635
+ #loc252 = loc("offs_n"(#loc59))
636
+ #loc253 = loc("acc"(#loc64))
637
+ #loc254 = loc("kv_base_offset"(#loc65))
638
+ #loc256 = loc("offs_n_load"(#loc67))
639
+ #loc257 = loc("ptr"(#loc68))
640
+ #loc258 = loc("k"(#loc69))
641
+ #loc259 = loc("ptr"(#loc70))
642
+ #loc260 = loc("ptr"(#loc71))
643
+ #loc261 = loc("ptr"(#loc72))
644
+ #loc262 = loc("k"(#loc75))
645
+ #loc263 = loc("qk"(#loc76))
646
+ #loc264 = loc("qk"(#loc77))
647
+ #loc265 = loc("m"(#loc79))
648
+ #loc266 = loc("n"(#loc80))
649
+ #loc267 = loc("post_mod_scores"(#loc81))
650
+ #loc268 = loc("post_mod_scores"(#loc82))
651
+ #loc269 = loc("tmp3"(#loc83))
652
+ #loc270 = loc("tmp5"(#loc84))
653
+ #loc271 = loc("tmp6"(#loc85))
654
+ #loc272 = loc("tmp7"(#loc86))
655
+ #loc273 = loc("tmp8"(#loc87))
656
+ #loc274 = loc("tmp9"(#loc88))
657
+ #loc275 = loc("tmp10"(#loc89))
658
+ #loc276 = loc("tmp11"(#loc90))
659
+ #loc277 = loc("tmp14"(#loc91))
660
+ #loc278 = loc("tmp14"(#loc92))
661
+ #loc279 = loc("tmp14"(#loc93))
662
+ #loc280 = loc("tmp14"(#loc94))
663
+ #loc281 = loc("tmp14"(#loc95))
664
+ #loc282 = loc("tmp14"(#loc96))
665
+ #loc283 = loc("tmp16"(#loc97))
666
+ #loc284 = loc("tmp16"(#loc98))
667
+ #loc285 = loc("tmp16"(#loc99))
668
+ #loc286 = loc("tmp16"(#loc100))
669
+ #loc287 = loc("tmp16"(#loc101))
670
+ #loc288 = loc("tmp16"(#loc102))
671
+ #loc289 = loc("tmp17"(#loc103))
672
+ #loc290 = loc("tmp18"(#loc104))
673
+ #loc291 = loc("tmp19"(#loc105))
674
+ #loc292 = loc("tmp20"(#loc106))
675
+ #loc293 = loc("mask_mod_output"(#loc107))
676
+ #loc294 = loc("post_mod_scores"(#loc108))
677
+ #loc295 = loc("post_mod_scores"(#loc109))
678
+ #loc297 = loc("m_ij"(#loc113))
679
+ #loc298 = loc("masked_out_rows"(#loc114))
680
+ #loc299 = loc("m_ij_masked"(#loc115))
681
+ #loc300 = loc("alpha"(#loc116))
682
+ #loc301 = loc("alpha"(#loc117))
683
+ #loc302 = loc("p"(#loc118))
684
+ #loc303 = loc("p"(#loc119))
685
+ #loc304 = loc("p"(#loc120))
686
+ #loc305 = loc("l_i"(#loc121))
687
+ #loc307 = loc("l_i"(#loc125))
688
+ #loc308 = loc("acc"(#loc126))
689
+ #loc309 = loc("acc"(#loc127))
690
+ #loc310 = loc("v"(#loc128))
691
+ #loc311 = loc("acc"(#loc129))
692
+ #loc312 = loc("acc"(#loc130))
693
+ #loc313 = loc("cur_block_idx"(#loc131))
694
+ #loc314 = loc("offset"(#loc132))
695
+ #loc315 = loc("cur_block"(#loc133))
696
+ #loc316 = loc("cur_block"(#loc134))
697
+ #loc317 = loc("next_block"(#loc135))
698
+ #loc318 = loc("next_block"(#loc136))
699
+ #loc319 = loc("next_block"(#loc137))
700
+ #loc320 = loc("next_block"(#loc138))
701
+ #loc321 = loc("needs_jump"(#loc139))
702
+ #loc322 = loc("needs_jump"(#loc140))
703
+ #loc323 = loc("needs_jump"(#loc141))
704
+ #loc324 = loc("jump_to_block"(#loc142))
705
+ #loc325 = loc("jump_to_block"(#loc143))
706
+ #loc326 = loc("jump_to_block"(#loc144))
707
+ #loc327 = loc("offset"(#loc145))
708
+ #loc328 = loc("offset"(#loc146))
709
+ #loc329 = loc("offset"(#loc147))
710
+ #loc330 = loc("offset"(#loc148))
711
+ #loc331 = loc("offs_n"(#loc149))
712
+ #loc332 = loc("kv_offset"(#loc150))
713
+ #loc333 = loc(callsite(#loc151 at #loc2))
714
+ #loc334 = loc("kv_num_blocks"(#loc152))
715
+ #loc335 = loc("block_n_start"(#loc153))
716
+ #loc336 = loc("block_n_start"(#loc154))
717
+ #loc337 = loc("block_n_end"(#loc155))
718
+ #loc338 = loc("off_n_block_in_sparse"(#loc156))
719
+ #loc339 = loc("off_n"(#loc157))
720
+ #loc340 = loc("off_n"(#loc158))
721
+ #loc341 = loc("off_n"(#loc159))
722
+ #loc342 = loc("off_n"(#loc160))
723
+ #loc343 = loc("block_n_last_valid"(#loc161))
724
+ #loc344 = loc("block_n_last_valid"(#loc162))
725
+ #loc345 = loc("offs_n"(#loc163))
726
+ #loc347 = loc(callsite(#loc151 at #loc167))
727
+ #loc348 = loc("m_offset"(#loc168))
728
+ #loc349 = loc("m_offset"(#loc169))
729
+ #loc350 = loc("m_offset"(#loc170))
730
+ #loc351 = loc("M_block_ptr"(#loc171))
731
+ #loc352 = loc("M_block_ptr"(#loc172))
732
+ #loc353 = loc("M_block_ptr"(#loc173))
733
+ #loc354 = loc("L_block_ptr"(#loc174))
734
+ #loc355 = loc("m_i"(#loc175))
735
+ #loc356 = loc("l_i"(#loc176))
736
+ #loc357 = loc("idx_hq"(#loc179))
737
+ #loc358 = loc("mask"(#loc180))
738
+ #loc359 = loc("acc"(#loc181))
739
+ #loc360 = loc(callsite(#loc3 at #loc206))
740
+ #loc361 = loc(callsite(#loc13 at #loc210))
741
+ #loc362 = loc(callsite(#loc15 at #loc210))
742
+ #loc363 = loc(callsite(#loc13 at #loc211))
743
+ #loc364 = loc(callsite(#loc15 at #loc211))
744
+ #loc365 = loc(callsite(#loc33 at #loc227))
745
+ #loc366 = loc(callsite(#loc13 at #loc248))
746
+ #loc367 = loc(callsite(#loc15 at #loc248))
747
+ #loc368 = loc("l_i"(#loc253))
748
+ #loc369 = loc(callsite(#loc254 at #loc255))
749
+ #loc370 = loc(callsite(#loc256 at #loc255))
750
+ #loc371 = loc(callsite(#loc258 at #loc255))
751
+ #loc372 = loc(callsite(#loc262 at #loc255))
752
+ #loc373 = loc(callsite(#loc263 at #loc255))
753
+ #loc374 = loc(callsite(#loc264 at #loc255))
754
+ #loc375 = loc(callsite(#loc265 at #loc255))
755
+ #loc376 = loc(callsite(#loc266 at #loc255))
756
+ #loc377 = loc(callsite(#loc267 at #loc255))
757
+ #loc378 = loc(callsite(#loc268 at #loc255))
758
+ #loc379 = loc(callsite(#loc269 at #loc255))
759
+ #loc380 = loc(callsite(#loc270 at #loc255))
760
+ #loc381 = loc(callsite(#loc271 at #loc255))
761
+ #loc382 = loc(callsite(#loc272 at #loc255))
762
+ #loc383 = loc(callsite(#loc273 at #loc255))
763
+ #loc384 = loc(callsite(#loc274 at #loc255))
764
+ #loc385 = loc(callsite(#loc275 at #loc255))
765
+ #loc386 = loc(callsite(#loc276 at #loc255))
766
+ #loc387 = loc(callsite(#loc277 at #loc255))
767
+ #loc388 = loc(callsite(#loc278 at #loc255))
768
+ #loc389 = loc(callsite(#loc279 at #loc255))
769
+ #loc390 = loc(callsite(#loc280 at #loc255))
770
+ #loc391 = loc(callsite(#loc281 at #loc255))
771
+ #loc392 = loc(callsite(#loc282 at #loc255))
772
+ #loc393 = loc(callsite(#loc283 at #loc255))
773
+ #loc394 = loc(callsite(#loc284 at #loc255))
774
+ #loc395 = loc(callsite(#loc285 at #loc255))
775
+ #loc396 = loc(callsite(#loc286 at #loc255))
776
+ #loc397 = loc(callsite(#loc287 at #loc255))
777
+ #loc398 = loc(callsite(#loc288 at #loc255))
778
+ #loc399 = loc(callsite(#loc289 at #loc255))
779
+ #loc400 = loc(callsite(#loc290 at #loc255))
780
+ #loc401 = loc(callsite(#loc291 at #loc255))
781
+ #loc402 = loc(callsite(#loc292 at #loc255))
782
+ #loc403 = loc(callsite(#loc293 at #loc255))
783
+ #loc404 = loc(callsite(#loc294 at #loc255))
784
+ #loc405 = loc(callsite(#loc295 at #loc255))
785
+ #loc407 = loc(callsite(#loc297 at #loc255))
786
+ #loc408 = loc(callsite(#loc298 at #loc255))
787
+ #loc409 = loc(callsite(#loc299 at #loc255))
788
+ #loc410 = loc(callsite(#loc300 at #loc255))
789
+ #loc411 = loc(callsite(#loc301 at #loc255))
790
+ #loc412 = loc(callsite(#loc302 at #loc255))
791
+ #loc413 = loc(callsite(#loc303 at #loc255))
792
+ #loc414 = loc(callsite(#loc304 at #loc255))
793
+ #loc415 = loc(callsite(#loc305 at #loc255))
794
+ #loc417 = loc(callsite(#loc307 at #loc255))
795
+ #loc418 = loc(callsite(#loc308 at #loc255))
796
+ #loc419 = loc(callsite(#loc309 at #loc255))
797
+ #loc420 = loc(callsite(#loc310 at #loc255))
798
+ #loc421 = loc(callsite(#loc311 at #loc255))
799
+ #loc422 = loc(callsite(#loc312 at #loc255))
800
+ #loc423 = loc(callsite(#loc314 at #loc2))
801
+ #loc424 = loc(callsite(#loc331 at #loc2))
802
+ #loc425 = loc(callsite(#loc332 at #loc2))
803
+ #loc426 = loc(callsite(#loc254 at #loc346))
804
+ #loc427 = loc(callsite(#loc256 at #loc346))
805
+ #loc428 = loc(callsite(#loc258 at #loc346))
806
+ #loc429 = loc(callsite(#loc262 at #loc346))
807
+ #loc430 = loc(callsite(#loc263 at #loc346))
808
+ #loc431 = loc(callsite(#loc264 at #loc346))
809
+ #loc432 = loc(callsite(#loc267 at #loc346))
810
+ #loc433 = loc(callsite(#loc268 at #loc346))
811
+ #loc434 = loc(callsite(#loc295 at #loc346))
812
+ #loc436 = loc(callsite(#loc297 at #loc346))
813
+ #loc437 = loc(callsite(#loc298 at #loc346))
814
+ #loc438 = loc(callsite(#loc299 at #loc346))
815
+ #loc439 = loc(callsite(#loc300 at #loc346))
816
+ #loc440 = loc(callsite(#loc301 at #loc346))
817
+ #loc441 = loc(callsite(#loc302 at #loc346))
818
+ #loc442 = loc(callsite(#loc303 at #loc346))
819
+ #loc443 = loc(callsite(#loc304 at #loc346))
820
+ #loc444 = loc(callsite(#loc305 at #loc346))
821
+ #loc446 = loc(callsite(#loc307 at #loc346))
822
+ #loc447 = loc(callsite(#loc308 at #loc346))
823
+ #loc448 = loc(callsite(#loc309 at #loc346))
824
+ #loc449 = loc(callsite(#loc310 at #loc346))
825
+ #loc450 = loc(callsite(#loc311 at #loc346))
826
+ #loc451 = loc(callsite(#loc312 at #loc346))
827
+ #loc452 = loc(callsite(#loc314 at #loc167))
828
+ #loc453 = loc(callsite(#loc331 at #loc167))
829
+ #loc454 = loc(callsite(#loc332 at #loc167))
830
+ #loc455 = loc("m_i"(#loc368))
831
+ #loc456 = loc(callsite(#loc257 at #loc371))
832
+ #loc457 = loc(callsite(#loc259 at #loc371))
833
+ #loc458 = loc(callsite(#loc260 at #loc371))
834
+ #loc459 = loc(callsite(#loc261 at #loc371))
835
+ #loc460 = loc(callsite(#loc73 at #loc371))
836
+ #loc461 = loc(callsite(#loc74 at #loc371))
837
+ #loc462 = loc(callsite(#loc78 at #loc375))
838
+ #loc463 = loc(callsite(#loc78 at #loc376))
839
+ #loc464 = loc(callsite(#loc110 at #loc406))
840
+ #loc466 = loc(callsite(#loc122 at #loc416))
841
+ #loc468 = loc(callsite(#loc260 at #loc420))
842
+ #loc469 = loc(callsite(#loc261 at #loc420))
843
+ #loc470 = loc(callsite(#loc74 at #loc420))
844
+ #loc471 = loc(callsite(#loc313 at #loc423))
845
+ #loc472 = loc(callsite(#loc315 at #loc423))
846
+ #loc473 = loc(callsite(#loc316 at #loc423))
847
+ #loc474 = loc(callsite(#loc317 at #loc423))
848
+ #loc475 = loc(callsite(#loc318 at #loc423))
849
+ #loc476 = loc(callsite(#loc319 at #loc423))
850
+ #loc477 = loc(callsite(#loc320 at #loc423))
851
+ #loc478 = loc(callsite(#loc321 at #loc423))
852
+ #loc479 = loc(callsite(#loc322 at #loc423))
853
+ #loc480 = loc(callsite(#loc323 at #loc423))
854
+ #loc481 = loc(callsite(#loc324 at #loc423))
855
+ #loc482 = loc(callsite(#loc325 at #loc423))
856
+ #loc483 = loc(callsite(#loc326 at #loc423))
857
+ #loc484 = loc(callsite(#loc327 at #loc423))
858
+ #loc485 = loc(callsite(#loc328 at #loc423))
859
+ #loc486 = loc(callsite(#loc329 at #loc423))
860
+ #loc487 = loc(callsite(#loc330 at #loc423))
861
+ #loc488 = loc(callsite(#loc257 at #loc428))
862
+ #loc489 = loc(callsite(#loc259 at #loc428))
863
+ #loc490 = loc(callsite(#loc260 at #loc428))
864
+ #loc491 = loc(callsite(#loc261 at #loc428))
865
+ #loc492 = loc(callsite(#loc73 at #loc428))
866
+ #loc493 = loc(callsite(#loc74 at #loc428))
867
+ #loc494 = loc(callsite(#loc110 at #loc435))
868
+ #loc496 = loc(callsite(#loc122 at #loc445))
869
+ #loc498 = loc(callsite(#loc260 at #loc449))
870
+ #loc499 = loc(callsite(#loc261 at #loc449))
871
+ #loc500 = loc(callsite(#loc74 at #loc449))
872
+ #loc501 = loc(callsite(#loc313 at #loc452))
873
+ #loc502 = loc(callsite(#loc315 at #loc452))
874
+ #loc503 = loc(callsite(#loc316 at #loc452))
875
+ #loc504 = loc(callsite(#loc317 at #loc452))
876
+ #loc505 = loc(callsite(#loc318 at #loc452))
877
+ #loc506 = loc(callsite(#loc319 at #loc452))
878
+ #loc507 = loc(callsite(#loc320 at #loc452))
879
+ #loc508 = loc(callsite(#loc321 at #loc452))
880
+ #loc509 = loc(callsite(#loc322 at #loc452))
881
+ #loc510 = loc(callsite(#loc323 at #loc452))
882
+ #loc511 = loc(callsite(#loc324 at #loc452))
883
+ #loc512 = loc(callsite(#loc325 at #loc452))
884
+ #loc513 = loc(callsite(#loc326 at #loc452))
885
+ #loc514 = loc(callsite(#loc327 at #loc452))
886
+ #loc515 = loc(callsite(#loc328 at #loc452))
887
+ #loc516 = loc(callsite(#loc329 at #loc452))
888
+ #loc517 = loc(callsite(#loc330 at #loc452))
889
+ #loc518 = loc("offs_n"(#loc455))
890
+ #loc519 = loc(callsite(#loc112 at #loc464))
891
+ #loc520 = loc(callsite(#loc124 at #loc466))
892
+ #loc521 = loc(callsite(#loc112 at #loc494))
893
+ #loc522 = loc(callsite(#loc124 at #loc496))
894
+ #loc523 = loc("kv_offset"(#loc518))
895
+ #loc524 = loc(callsite(#loc523 at #loc2))
896
+ #loc525 = loc(callsite(#loc523 at #loc167))
progress/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/__grp__triton_tem_fused_mul_1.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_tem_fused_mul_1.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.source", "triton_tem_fused_mul_1.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.ttir", "triton_tem_fused_mul_1.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.ttgir", "triton_tem_fused_mul_1.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.llir", "triton_tem_fused_mul_1.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.ptx", "triton_tem_fused_mul_1.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.cubin", "triton_tem_fused_mul_1.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.json"}}
progress/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "e9189965502344ccf889732c148fe3546cbfbba32ff4ff89a591e4bb193cb58d", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 164864, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_tem_fused_mul_1"}
progress/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.llir ADDED
The diff for this file is too large to render. See raw diff
 
progress/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.ptx ADDED
The diff for this file is too large to render. See raw diff
 
progress/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.source ADDED
The diff for this file is too large to render. See raw diff
 
progress/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.ttgir ADDED
The diff for this file is too large to render. See raw diff
 
progress/SpecForge/cache/compiled_kernels/triton/3/5EMJSZKQENCMZ6EJOMWBJD7DKRWL7O5DF72P7CNFSHSLWGJ4WWGQ/triton_tem_fused_mul_1.ttir ADDED
The diff for this file is too large to render. See raw diff
 
progress/SpecForge/cache/compiled_kernels/triton/3/5GHJ3QO6IQW65MAR65YJDHQXMEEKX2KGGUQWS5XUVWXZI7RAC2YA/__grp__triton_per_fused_2.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"child_paths": {"triton_per_fused_2.source": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5GHJ3QO6IQW65MAR65YJDHQXMEEKX2KGGUQWS5XUVWXZI7RAC2YA/triton_per_fused_2.source", "triton_per_fused_2.ttir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5GHJ3QO6IQW65MAR65YJDHQXMEEKX2KGGUQWS5XUVWXZI7RAC2YA/triton_per_fused_2.ttir", "triton_per_fused_2.ttgir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5GHJ3QO6IQW65MAR65YJDHQXMEEKX2KGGUQWS5XUVWXZI7RAC2YA/triton_per_fused_2.ttgir", "triton_per_fused_2.llir": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5GHJ3QO6IQW65MAR65YJDHQXMEEKX2KGGUQWS5XUVWXZI7RAC2YA/triton_per_fused_2.llir", "triton_per_fused_2.ptx": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5GHJ3QO6IQW65MAR65YJDHQXMEEKX2KGGUQWS5XUVWXZI7RAC2YA/triton_per_fused_2.ptx", "triton_per_fused_2.cubin": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5GHJ3QO6IQW65MAR65YJDHQXMEEKX2KGGUQWS5XUVWXZI7RAC2YA/triton_per_fused_2.cubin", "triton_per_fused_2.json": "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/triton/3/5GHJ3QO6IQW65MAR65YJDHQXMEEKX2KGGUQWS5XUVWXZI7RAC2YA/triton_per_fused_2.json"}}
progress/SpecForge/cache/compiled_kernels/triton/3/5GHJ3QO6IQW65MAR65YJDHQXMEEKX2KGGUQWS5XUVWXZI7RAC2YA/triton_per_fused_2.cubin ADDED
Binary file (9.65 kB). View file
 
progress/SpecForge/cache/compiled_kernels/triton/3/5GHJ3QO6IQW65MAR65YJDHQXMEEKX2KGGUQWS5XUVWXZI7RAC2YA/triton_per_fused_2.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hash": "e98e9dc1de442deeb011f770919e176108abe94635216976f4adaf947e2016b0", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused_2"}
progress/SpecForge/cache/compiled_kernels/triton/3/5GHJ3QO6IQW65MAR65YJDHQXMEEKX2KGGUQWS5XUVWXZI7RAC2YA/triton_per_fused_2.llir ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+ target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
4
+
5
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
6
+
7
+ ; Function Attrs: nounwind
8
+ define ptx_kernel void @triton_per_fused_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6, ptr addrspace(1) readnone captures(none) %7, ptr addrspace(1) readnone captures(none) %8) local_unnamed_addr #0 !dbg !5 {
9
+ %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8
10
+ %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
11
+ %12 = and i32 %11, 31, !dbg !9
12
+ %13 = lshr i32 %10, 7, !dbg !10
13
+ %14 = mul nuw nsw i32 %12, 196608, !dbg !11
14
+ %15 = add nuw i32 %14, %10, !dbg !12
15
+ %16 = sext i32 %15 to i64, !dbg !13
16
+ %17 = getelementptr float, ptr addrspace(1) %0, i64 %16, !dbg !13
17
+ %18 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %17) #5, !dbg !14
18
+ %19 = zext nneg i32 %13 to i64, !dbg !15
19
+ %20 = getelementptr float, ptr addrspace(1) %1, i64 %19, !dbg !15
20
+ %21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !16
21
+ %22 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l"(ptr addrspace(1) %20, i64 %21) #5, !dbg !16
22
+ %23 = bitcast i32 %22 to float, !dbg !16
23
+ %24 = mul nuw nsw i32 %12, 1536, !dbg !17
24
+ %25 = add nuw nsw i32 %24, %13, !dbg !18
25
+ %26 = zext nneg i32 %25 to i64, !dbg !19
26
+ %27 = getelementptr float, ptr addrspace(1) %2, i64 %26, !dbg !19
27
+ %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20
28
+ %29 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l"(ptr addrspace(1) %27, i64 %28) #5, !dbg !20
29
+ %30 = bitcast i32 %29 to float, !dbg !20
30
+ %31 = getelementptr float, ptr addrspace(1) %3, i64 %19, !dbg !21
31
+ %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !22
32
+ %33 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l"(ptr addrspace(1) %31, i64 %32) #5, !dbg !22
33
+ %34 = fcmp oeq float %23, 0xFFF0000000000000, !dbg !23
34
+ %35 = fsub float %30, %23, !dbg !24
35
+ %36 = select i1 %34, float 0.000000e+00, float %35, !dbg !25
36
+ %37 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !26
37
+ %.not.i = icmp eq i32 %37, 0, !dbg !26
38
+ br i1 %.not.i, label %40, label %38, !dbg !26
39
+
40
+ 38: ; preds = %9
41
+ %39 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %36) #5, !dbg !26
42
+ br label %__nv_exp2f.exit, !dbg !26
43
+
44
+ 40: ; preds = %9
45
+ %41 = tail call float @llvm.nvvm.ex2.approx.f(float %36) #5, !dbg !26
46
+ br label %__nv_exp2f.exit, !dbg !26
47
+
48
+ __nv_exp2f.exit: ; preds = %38, %40
49
+ %.0.i = phi float [ %39, %38 ], [ %41, %40 ], !dbg !26
50
+ %42 = bitcast i32 %33 to float, !dbg !22
51
+ %43 = bitcast i32 %18 to float, !dbg !14
52
+ %44 = udiv i32 %10, 6144, !dbg !27
53
+ %45 = urem i32 %13, 48, !dbg !28
54
+ %46 = and i32 %10, 127, !dbg !29
55
+ %47 = fmul float %.0.i, %43, !dbg !30
56
+ %48 = bitcast float %47 to i32, !dbg !31
57
+ %49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 16, i32 31), !dbg !31
58
+ %50 = bitcast i32 %49 to float, !dbg !31
59
+ %51 = fadd float %47, %50, !dbg !35
60
+ %52 = bitcast float %51 to i32, !dbg !31
61
+ %53 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %52, i32 8, i32 31), !dbg !31
62
+ %54 = bitcast i32 %53 to float, !dbg !31
63
+ %55 = fadd float %51, %54, !dbg !35
64
+ %56 = bitcast float %55 to i32, !dbg !31
65
+ %57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 4, i32 31), !dbg !31
66
+ %58 = bitcast i32 %57 to float, !dbg !31
67
+ %59 = fadd float %55, %58, !dbg !35
68
+ %60 = bitcast float %59 to i32, !dbg !31
69
+ %61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 2, i32 31), !dbg !31
70
+ %62 = bitcast i32 %61 to float, !dbg !31
71
+ %63 = fadd float %59, %62, !dbg !35
72
+ %64 = bitcast float %63 to i32, !dbg !31
73
+ %65 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %64, i32 1, i32 31), !dbg !31
74
+ %66 = bitcast i32 %65 to float, !dbg !31
75
+ %67 = fadd float %63, %66, !dbg !35
76
+ %68 = select i1 %34, float 1.000000e+00, float %42, !dbg !36
77
+ %69 = tail call float @llvm.nvvm.div.full(float %67, float %68), !dbg !37
78
+ %70 = shl nuw nsw i32 %44, 7, !dbg !38
79
+ %71 = or disjoint i32 %70, %46, !dbg !39
80
+ %72 = shl nuw nsw i32 %45, 12, !dbg !40
81
+ %73 = add nuw nsw i32 %71, %72, !dbg !41
82
+ %74 = zext nneg i32 %73 to i64, !dbg !42
83
+ %75 = getelementptr bfloat, ptr addrspace(1) %4, i64 %74, !dbg !42
84
+ %76 = fptrunc float %69 to bfloat, !dbg !43
85
+ %77 = and i32 %11, 63, !dbg !43
86
+ %78 = icmp eq i32 %77, 0, !dbg !43
87
+ %79 = bitcast bfloat %76 to i16, !dbg !43
88
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %79, ptr addrspace(1) %75, i1 %78) #5, !dbg !43
89
+ ret void, !dbg !44
90
+ }
91
+
92
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
93
+ declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
94
+
95
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
96
+ declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
97
+
98
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
99
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
100
+
101
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
102
+ declare float @llvm.nvvm.div.full(float, float) #3
103
+
104
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
105
+
106
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
107
+ declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
108
+
109
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
110
+ declare float @llvm.nvvm.ex2.approx.f(float) #3
111
+
112
+ attributes #0 = { nounwind "nvvm.reqntid"="64" }
113
+ attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
114
+ attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
115
+ attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
116
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
117
+ attributes #5 = { nounwind }
118
+
119
+ !llvm.dbg.cu = !{!0}
120
+ !llvm.module.flags = !{!2, !3}
121
+ !llvm.ident = !{!4}
122
+
123
+ !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
124
+ !1 = !DIFile(filename: "ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py", directory: "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ee")
125
+ !2 = !{i32 2, !"Debug Info Version", i32 3}
126
+ !3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
127
+ !4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
128
+ !5 = distinct !DISubprogram(name: "triton_per_fused_2", linkageName: "triton_per_fused_2", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
129
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
130
+ !7 = !{}
131
+ !8 = !DILocation(line: 24, column: 28, scope: !5)
132
+ !9 = !DILocation(line: 27, column: 38, scope: !5)
133
+ !10 = !DILocation(line: 34, column: 19, scope: !5)
134
+ !11 = !DILocation(line: 38, column: 42, scope: !5)
135
+ !12 = !DILocation(line: 38, column: 35, scope: !5)
136
+ !13 = !DILocation(line: 38, column: 30, scope: !5)
137
+ !14 = !DILocation(line: 38, column: 49, scope: !5)
138
+ !15 = !DILocation(line: 39, column: 30, scope: !5)
139
+ !16 = !DILocation(line: 39, column: 35, scope: !5)
140
+ !17 = !DILocation(line: 40, column: 40, scope: !5)
141
+ !18 = !DILocation(line: 40, column: 35, scope: !5)
142
+ !19 = !DILocation(line: 40, column: 30, scope: !5)
143
+ !20 = !DILocation(line: 40, column: 47, scope: !5)
144
+ !21 = !DILocation(line: 41, column: 31, scope: !5)
145
+ !22 = !DILocation(line: 41, column: 36, scope: !5)
146
+ !23 = !DILocation(line: 43, column: 19, scope: !5)
147
+ !24 = !DILocation(line: 44, column: 18, scope: !5)
148
+ !25 = !DILocation(line: 46, column: 32, scope: !5)
149
+ !26 = !DILocation(line: 47, column: 26, scope: !5)
150
+ !27 = !DILocation(line: 37, column: 19, scope: !5)
151
+ !28 = !DILocation(line: 36, column: 28, scope: !5)
152
+ !29 = !DILocation(line: 35, column: 19, scope: !5)
153
+ !30 = !DILocation(line: 48, column: 18, scope: !5)
154
+ !31 = !DILocation(line: 291, column: 36, scope: !32, inlinedAt: !34)
155
+ !32 = distinct !DILexicalBlockFile(scope: !5, file: !33, discriminator: 0)
156
+ !33 = !DIFile(filename: "standard.py", directory: "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language")
157
+ !34 = !DILocation(line: 50, column: 26, scope: !5)
158
+ !35 = !DILocation(line: 261, column: 15, scope: !32, inlinedAt: !34)
159
+ !36 = !DILocation(line: 52, column: 34, scope: !5)
160
+ !37 = !DILocation(line: 53, column: 21, scope: !5)
161
+ !38 = !DILocation(line: 55, column: 34, scope: !5)
162
+ !39 = !DILocation(line: 55, column: 30, scope: !5)
163
+ !40 = !DILocation(line: 55, column: 44, scope: !5)
164
+ !41 = !DILocation(line: 55, column: 39, scope: !5)
165
+ !42 = !DILocation(line: 55, column: 25, scope: !5)
166
+ !43 = !DILocation(line: 55, column: 56, scope: !5)
167
+ !44 = !DILocation(line: 55, column: 4, scope: !5)
progress/SpecForge/cache/compiled_kernels/triton/3/5GHJ3QO6IQW65MAR65YJDHQXMEEKX2KGGUQWS5XUVWXZI7RAC2YA/triton_per_fused_2.ptx ADDED
@@ -0,0 +1,389 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.7
6
+ .target sm_90a
7
+ .address_size 64
8
+
9
+ // .globl triton_per_fused_2 // -- Begin function triton_per_fused_2
10
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
11
+ // @triton_per_fused_2
12
+ .visible .entry triton_per_fused_2(
13
+ .param .u64 .ptr .global .align 1 triton_per_fused_2_param_0,
14
+ .param .u64 .ptr .global .align 1 triton_per_fused_2_param_1,
15
+ .param .u64 .ptr .global .align 1 triton_per_fused_2_param_2,
16
+ .param .u64 .ptr .global .align 1 triton_per_fused_2_param_3,
17
+ .param .u64 .ptr .global .align 1 triton_per_fused_2_param_4,
18
+ .param .u32 triton_per_fused_2_param_5,
19
+ .param .u32 triton_per_fused_2_param_6,
20
+ .param .u64 .ptr .global .align 1 triton_per_fused_2_param_7,
21
+ .param .u64 .ptr .global .align 1 triton_per_fused_2_param_8
22
+ )
23
+ .reqntid 64
24
+ {
25
+ .reg .pred %p<3>;
26
+ .reg .b16 %rs<2>;
27
+ .reg .b32 %r<38>;
28
+ .reg .b64 %rd<18>;
29
+ .loc 1 18 0 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:18:0
30
+ $L__func_begin0:
31
+ .loc 1 18 0 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:18:0
32
+
33
+ // %bb.0: // %__nv_exp2f.exit
34
+ ld.param.b64 %rd12, [triton_per_fused_2_param_0];
35
+ ld.param.b64 %rd13, [triton_per_fused_2_param_1];
36
+ $L__tmp0:
37
+ .loc 1 24 28 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:24:28
38
+ mov.u32 %r5, %ctaid.x;
39
+ ld.param.b64 %rd14, [triton_per_fused_2_param_2];
40
+ .loc 1 27 38 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:27:38
41
+ mov.u32 %r6, %tid.x;
42
+ and.b32 %r7, %r6, 31;
43
+ ld.param.b64 %rd15, [triton_per_fused_2_param_3];
44
+ .loc 1 34 19 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:34:19
45
+ shr.u32 %r8, %r5, 7;
46
+ ld.param.b64 %rd16, [triton_per_fused_2_param_4];
47
+ .loc 1 38 35 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:38:35
48
+ mad.lo.s32 %r9, %r7, 196608, %r5;
49
+ .loc 1 38 30 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:38:30
50
+ mad.wide.s32 %rd1, %r9, 4, %rd12;
51
+ .loc 1 38 49 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:38:49
52
+ // begin inline asm
53
+ mov.u32 %r1, 0x0;
54
+ ld.global.b32 { %r1 }, [ %rd1 + 0 ];
55
+ // end inline asm
56
+ .loc 1 39 30 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:39:30
57
+ mul.wide.u32 %rd17, %r8, 4;
58
+ add.s64 %rd3, %rd13, %rd17;
59
+ .loc 1 39 35 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:39:35
60
+ // begin inline asm
61
+ mov.u64 %rd4, 0x0;
62
+ createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0;
63
+ // end inline asm
64
+ // begin inline asm
65
+ mov.u32 %r2, 0x0;
66
+ ld.global.L1::evict_last.L2::cache_hint.b32 { %r2 }, [ %rd3 + 0 ], %rd4;
67
+ // end inline asm
68
+ .loc 1 40 35 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:40:35
69
+ mad.lo.s32 %r10, %r7, 1536, %r8;
70
+ .loc 1 40 30 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:40:30
71
+ mad.wide.u32 %rd6, %r10, 4, %rd14;
72
+ .loc 1 40 47 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:40:47
73
+ // begin inline asm
74
+ mov.u64 %rd7, 0x0;
75
+ createpolicy.fractional.L2::evict_last.b64 %rd7, 1.0;
76
+ // end inline asm
77
+ // begin inline asm
78
+ mov.u32 %r3, 0x0;
79
+ ld.global.L1::evict_last.L2::cache_hint.b32 { %r3 }, [ %rd6 + 0 ], %rd7;
80
+ // end inline asm
81
+ .loc 1 41 31 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:41:31
82
+ add.s64 %rd9, %rd15, %rd17;
83
+ .loc 1 41 36 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:41:36
84
+ // begin inline asm
85
+ mov.u64 %rd10, 0x0;
86
+ createpolicy.fractional.L2::evict_last.b64 %rd10, 1.0;
87
+ // end inline asm
88
+ // begin inline asm
89
+ mov.u32 %r4, 0x0;
90
+ ld.global.L1::evict_last.L2::cache_hint.b32 { %r4 }, [ %rd9 + 0 ], %rd10;
91
+ // end inline asm
92
+ .loc 1 43 19 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:43:19
93
+ setp.eq.f32 %p2, %r2, 0fFF800000;
94
+ .loc 1 44 18 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:44:18
95
+ sub.f32 %r11, %r3, %r2;
96
+ .loc 1 46 32 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:46:32
97
+ selp.f32 %r12, 0f00000000, %r11, %p2;
98
+ .loc 1 47 26 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:47:26
99
+ ex2.approx.ftz.f32 %r13, %r12;
100
+ .loc 1 37 19 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:37:19
101
+ mul.hi.u32 %r14, %r5, 715827883;
102
+ .loc 1 36 28 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:36:28
103
+ mul.hi.u32 %r15, %r8, 89478486;
104
+ mul.lo.s32 %r16, %r15, 48;
105
+ sub.s32 %r17, %r8, %r16;
106
+ .loc 1 35 19 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:35:19
107
+ and.b32 %r18, %r5, 127;
108
+ .loc 1 48 18 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:48:18
109
+ mul.f32 %r19, %r13, %r1;
110
+ $L__tmp1:
111
+ .loc 2 291 36 // standard.py:291:36 @[ ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:50:26 ]
112
+ shfl.sync.bfly.b32 %r20, %r19, 16, 31, -1;
113
+ .loc 2 261 15 // standard.py:261:15 @[ ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:50:26 ]
114
+ fma.rn.f32 %r21, %r13, %r1, %r20;
115
+ .loc 2 291 36 // standard.py:291:36 @[ ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:50:26 ]
116
+ shfl.sync.bfly.b32 %r22, %r21, 8, 31, -1;
117
+ .loc 2 261 15 // standard.py:261:15 @[ ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:50:26 ]
118
+ add.f32 %r23, %r21, %r22;
119
+ .loc 2 291 36 // standard.py:291:36 @[ ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:50:26 ]
120
+ shfl.sync.bfly.b32 %r24, %r23, 4, 31, -1;
121
+ .loc 2 261 15 // standard.py:261:15 @[ ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:50:26 ]
122
+ add.f32 %r25, %r23, %r24;
123
+ .loc 2 291 36 // standard.py:291:36 @[ ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:50:26 ]
124
+ shfl.sync.bfly.b32 %r26, %r25, 2, 31, -1;
125
+ .loc 2 261 15 // standard.py:261:15 @[ ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:50:26 ]
126
+ add.f32 %r27, %r25, %r26;
127
+ .loc 2 291 36 // standard.py:291:36 @[ ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:50:26 ]
128
+ shfl.sync.bfly.b32 %r28, %r27, 1, 31, -1;
129
+ .loc 2 261 15 // standard.py:261:15 @[ ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:50:26 ]
130
+ add.f32 %r29, %r27, %r28;
131
+ $L__tmp2:
132
+ .loc 1 52 34 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:52:34
133
+ selp.f32 %r30, 0f3F800000, %r4, %p2;
134
+ .loc 1 53 21 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:53:21
135
+ div.full.f32 %r31, %r29, %r30;
136
+ .loc 1 55 34 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:55:34
137
+ shr.u32 %r32, %r14, 3;
138
+ and.b32 %r33, %r32, 67108736;
139
+ .loc 1 55 30 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:55:30
140
+ or.b32 %r34, %r33, %r18;
141
+ .loc 1 55 44 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:55:44
142
+ shl.b32 %r35, %r17, 12;
143
+ .loc 1 55 39 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:55:39
144
+ add.s32 %r36, %r34, %r35;
145
+ .loc 1 55 25 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:55:25
146
+ mad.wide.u32 %rd11, %r36, 2, %rd16;
147
+ .loc 1 55 56 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:55:56
148
+ cvt.rn.bf16.f32 %rs1, %r31;
149
+ and.b32 %r37, %r6, 63;
150
+ setp.eq.b32 %p1, %r37, 0;
151
+ // begin inline asm
152
+ @%p1 st.global.b16 [ %rd11 + 0 ], { %rs1 };
153
+ // end inline asm
154
+ .loc 1 55 4 // ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py:55:4
155
+ ret;
156
+ $L__tmp3:
157
+ $L__func_end0:
158
+ // -- End function
159
+ }
160
+ .file 1 "/workspace/hanrui/junquan/SpecForge/cache/compiled_kernels/ee/ceeui5pfegm2pcpkokzghtbznstx2p5iqb3n6wdv2ci43lzhlejq.py"
161
+ .file 2 "/workspace/hanrui/specforge/lib/python3.11/site-packages/triton/language/standard.py"
162
+ .section .debug_abbrev
163
+ {
164
+ .b8 1 // Abbreviation Code
165
+ .b8 17 // DW_TAG_compile_unit
166
+ .b8 1 // DW_CHILDREN_yes
167
+ .b8 37 // DW_AT_producer
168
+ .b8 8 // DW_FORM_string
169
+ .b8 19 // DW_AT_language
170
+ .b8 5 // DW_FORM_data2
171
+ .b8 3 // DW_AT_name
172
+ .b8 8 // DW_FORM_string
173
+ .b8 16 // DW_AT_stmt_list
174
+ .b8 6 // DW_FORM_data4
175
+ .b8 27 // DW_AT_comp_dir
176
+ .b8 8 // DW_FORM_string
177
+ .b8 0 // EOM(1)
178
+ .b8 0 // EOM(2)
179
+ .b8 2 // Abbreviation Code
180
+ .b8 46 // DW_TAG_subprogram
181
+ .b8 0 // DW_CHILDREN_no
182
+ .b8 3 // DW_AT_name
183
+ .b8 8 // DW_FORM_string
184
+ .b8 32 // DW_AT_inline
185
+ .b8 11 // DW_FORM_data1
186
+ .b8 0 // EOM(1)
187
+ .b8 0 // EOM(2)
188
+ .b8 3 // Abbreviation Code
189
+ .b8 46 // DW_TAG_subprogram
190
+ .b8 1 // DW_CHILDREN_yes
191
+ .b8 17 // DW_AT_low_pc
192
+ .b8 1 // DW_FORM_addr
193
+ .b8 18 // DW_AT_high_pc
194
+ .b8 1 // DW_FORM_addr
195
+ .b8 49 // DW_AT_abstract_origin
196
+ .b8 19 // DW_FORM_ref4
197
+ .b8 0 // EOM(1)
198
+ .b8 0 // EOM(2)
199
+ .b8 4 // Abbreviation Code
200
+ .b8 29 // DW_TAG_inlined_subroutine
201
+ .b8 0 // DW_CHILDREN_no
202
+ .b8 49 // DW_AT_abstract_origin
203
+ .b8 19 // DW_FORM_ref4
204
+ .b8 17 // DW_AT_low_pc
205
+ .b8 1 // DW_FORM_addr
206
+ .b8 18 // DW_AT_high_pc
207
+ .b8 1 // DW_FORM_addr
208
+ .b8 88 // DW_AT_call_file
209
+ .b8 11 // DW_FORM_data1
210
+ .b8 89 // DW_AT_call_line
211
+ .b8 11 // DW_FORM_data1
212
+ .b8 87 // DW_AT_call_column
213
+ .b8 11 // DW_FORM_data1
214
+ .b8 0 // EOM(1)
215
+ .b8 0 // EOM(2)
216
+ .b8 0 // EOM(3)
217
+ }
218
+ .section .debug_info
219
+ {
220
+ .b32 207 // Length of Unit
221
+ .b8 2 // DWARF version number
222
+ .b8 0
223
+ .b32 .debug_abbrev // Offset Into Abbrev. Section
224
+ .b8 8 // Address Size (in bytes)
225
+ .b8 1 // Abbrev [1] 0xb:0xc8 DW_TAG_compile_unit
226
+ .b8 116 // DW_AT_producer
227
+ .b8 114
228
+ .b8 105
229
+ .b8 116
230
+ .b8 111
231
+ .b8 110
232
+ .b8 0
233
+ .b8 2 // DW_AT_language
234
+ .b8 0
235
+ .b8 99 // DW_AT_name
236
+ .b8 101
237
+ .b8 101
238
+ .b8 117
239
+ .b8 105
240
+ .b8 53
241
+ .b8 112
242
+ .b8 102
243
+ .b8 101
244
+ .b8 103
245
+ .b8 109
246
+ .b8 50
247
+ .b8 112
248
+ .b8 99
249
+ .b8 112
250
+ .b8 107
251
+ .b8 111
252
+ .b8 107
253
+ .b8 122
254
+ .b8 103
255
+ .b8 104
256
+ .b8 116
257
+ .b8 98
258
+ .b8 122
259
+ .b8 110
260
+ .b8 115
261
+ .b8 116
262
+ .b8 120
263
+ .b8 50
264
+ .b8 112
265
+ .b8 53
266
+ .b8 105
267
+ .b8 113
268
+ .b8 98
269
+ .b8 51
270
+ .b8 110
271
+ .b8 54
272
+ .b8 119
273
+ .b8 100
274
+ .b8 118
275
+ .b8 50
276
+ .b8 99
277
+ .b8 105
278
+ .b8 52
279
+ .b8 51
280
+ .b8 108
281
+ .b8 122
282
+ .b8 104
283
+ .b8 108
284
+ .b8 101
285
+ .b8 106
286
+ .b8 113
287
+ .b8 46
288
+ .b8 112
289
+ .b8 121
290
+ .b8 0
291
+ .b32 .debug_line // DW_AT_stmt_list
292
+ .b8 47 // DW_AT_comp_dir
293
+ .b8 119
294
+ .b8 111
295
+ .b8 114
296
+ .b8 107
297
+ .b8 115
298
+ .b8 112
299
+ .b8 97
300
+ .b8 99
301
+ .b8 101
302
+ .b8 47
303
+ .b8 104
304
+ .b8 97
305
+ .b8 110
306
+ .b8 114
307
+ .b8 117
308
+ .b8 105
309
+ .b8 47
310
+ .b8 106
311
+ .b8 117
312
+ .b8 110
313
+ .b8 113
314
+ .b8 117
315
+ .b8 97
316
+ .b8 110
317
+ .b8 47
318
+ .b8 83
319
+ .b8 112
320
+ .b8 101
321
+ .b8 99
322
+ .b8 70
323
+ .b8 111
324
+ .b8 114
325
+ .b8 103
326
+ .b8 101
327
+ .b8 47
328
+ .b8 99
329
+ .b8 97
330
+ .b8 99
331
+ .b8 104
332
+ .b8 101
333
+ .b8 47
334
+ .b8 99
335
+ .b8 111
336
+ .b8 109
337
+ .b8 112
338
+ .b8 105
339
+ .b8 108
340
+ .b8 101
341
+ .b8 100
342
+ .b8 95
343
+ .b8 107
344
+ .b8 101
345
+ .b8 114
346
+ .b8 110
347
+ .b8 101
348
+ .b8 108
349
+ .b8 115
350
+ .b8 47
351
+ .b8 101
352
+ .b8 101
353
+ .b8 0
354
+ .b8 2 // Abbrev [2] 0x8f:0x15 DW_TAG_subprogram
355
+ .b8 116 // DW_AT_name
356
+ .b8 114
357
+ .b8 105
358
+ .b8 116
359
+ .b8 111
360
+ .b8 110
361
+ .b8 95
362
+ .b8 112
363
+ .b8 101
364
+ .b8 114
365
+ .b8 95
366
+ .b8 102
367
+ .b8 117
368
+ .b8 115
369
+ .b8 101
370
+ .b8 100
371
+ .b8 95
372
+ .b8 50
373
+ .b8 0
374
+ .b8 1 // DW_AT_inline
375
+ .b8 3 // Abbrev [3] 0xa4:0x2e DW_TAG_subprogram
376
+ .b64 $L__func_begin0 // DW_AT_low_pc
377
+ .b64 $L__func_end0 // DW_AT_high_pc
378
+ .b32 143 // DW_AT_abstract_origin
379
+ .b8 4 // Abbrev [4] 0xb9:0x18 DW_TAG_inlined_subroutine
380
+ .b32 143 // DW_AT_abstract_origin
381
+ .b64 $L__tmp1 // DW_AT_low_pc
382
+ .b64 $L__tmp2 // DW_AT_high_pc
383
+ .b8 1 // DW_AT_call_file
384
+ .b8 50 // DW_AT_call_line
385
+ .b8 26 // DW_AT_call_column
386
+ .b8 0 // End Of Children Mark
387
+ .b8 0 // End Of Children Mark
388
+ }
389
+ .section .debug_macinfo { }